diff --git a/.gitignore b/.gitignore index 65c99da30e..7b23facbe3 100644 --- a/.gitignore +++ b/.gitignore @@ -31,8 +31,10 @@ bindings/ruby/lib/fdboptions.rb bindings/ruby/fdb.gemspec fdbclient/vexillographer/obj/ fdbrpc/hgVersion*.h +fdbrpc/SourceVersion*.h fdbrpc/libeio/config.h flow/hgVersion*.h +flow/SourceVersion*.h generated.mk versions.h packaging/msi/FDBInstaller.wix* diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ca7d2842d..762ba597c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,18 +29,23 @@ if("${PROJECT_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}") message(FATAL_ERROR "In-source builds are forbidden") endif() +set(OPEN_FOR_IDE OFF CACHE BOOL "Open this in an IDE (won't compile/link)") + if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to 'Release' as none was specified") - set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" - "MinSizeRel" "RelWithDebInfo") + if (OPEN_FOR_IDE) + message(STATUS "Defaulting build type to 'Debug' for OPEN_FOR_IDE") + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build" FORCE) + else() + message(STATUS "Setting build type to 'Release' as none was specified") + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" + "MinSizeRel" "RelWithDebInfo") + endif() endif() set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) -set(OPEN_FOR_IDE OFF CACHE BOOL "Open this in an IDE (won't compile/link)") - ################################################################################ # Packages used for bindings ################################################################################ @@ -196,7 +201,7 @@ add_subdirectory(tests) if(WITH_DOCUMENTATION) add_subdirectory(documentation) endif() -add_subdirectory(monitoring) +add_subdirectory(contrib/monitoring) if(WIN32) add_subdirectory(packaging/msi) diff --git a/Makefile b/Makefile index 875ca76593..79f2cb05ec 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,8 @@ ifeq ($(PLATFORM),Linux) ifneq '' '$(findstring clang++,$(CXX))' CXXFLAGS += -Wno-undefined-var-template -Wno-unknown-warning-option -Wno-unused-command-line-argument -Wno-register -Wno-logical-op-parentheses + else + CXXFLAGS += -Wno-attributes endif CXXFLAGS += -std=c++17 diff --git a/bindings/c/test/mako/mako.c b/bindings/c/test/mako/mako.c index b365ce3d32..cc8cdc785f 100755 --- a/bindings/c/test/mako/mako.c +++ b/bindings/c/test/mako/mako.c @@ -23,12 +23,17 @@ #include "utils.h" #include "fdbclient/zipf.h" +/* global variables */ +FILE *printme; /* descriptor used for default messages */ +FILE *annoyme; /* descriptor used for annoying messages */ +FILE *debugme; /* descriptor used for debug messages */ + #define check_fdb_error(_e) \ do { \ if (_e) { \ fprintf(stderr, "ERROR: Failed at %s:%d (%s)\n", __FILE__, __LINE__, \ fdb_get_error(_e)); \ - goto FDB_FAIL; \ + goto failExit; \ } \ } while (0) @@ -37,10 +42,47 @@ if ((fdb_future_block_until_ready(_f)) != 0) { \ fprintf(stderr, "ERROR: fdb_future_block_until_ready failed at %s:%d\n", \ __FILE__, __LINE__); \ - goto FDB_FAIL; \ + goto failExit; \ } \ } while (0) +#define fdb_wait_and_handle_error(_func, _f, _t) \ + do { \ + int err = wait_future(_f); \ + if (err) { \ + int err2; \ + if ((err != 1020 /* not_committed */) && \ + (err != 1021 /* commit_unknown_result */)) { \ + 
fprintf(stderr, "ERROR: Error %s (%d) occured at %s\n", \ + #_func, err, fdb_get_error(err)); \ + } else { \ + fprintf(annoyme, "ERROR: Error %s (%d) occured at %s\n", \ + #_func, err, fdb_get_error(err)); \ + } \ + fdb_future_destroy(_f); \ + _f = fdb_transaction_on_error(_t, err); \ + /* this will return the original error for non-retryable errors */ \ + err2 = wait_future(_f); \ + fdb_future_destroy(_f); \ + if (err2) { \ + /* unretryable error */ \ + fprintf(stderr, \ + "ERROR: fdb_transaction_on_error returned %d at %s:%d\n", \ + err2, __FILE__, __LINE__); \ + fdb_transaction_reset(_t); \ + /* TODO: if we adda retry limit in the future, \ + * handle the conflict stats properly. \ + */ \ + return FDB_ERROR_ABORT; \ + } \ + if (err == 1020 /* not_committed */) { \ + return FDB_ERROR_CONFLICT; \ + } \ + return FDB_ERROR_RETRY; \ + } \ + } while (0) + + fdb_error_t wait_future(FDBFuture *f) { fdb_error_t err; @@ -52,47 +94,17 @@ fdb_error_t wait_future(FDBFuture *f) { } -int commit_transaction(FDBTransaction *transaction, mako_stats_t *stats) { +int commit_transaction(FDBTransaction *transaction) { FDBFuture *f; - fdb_error_t err = 0; - int retry = DEFAULT_RETRY_COUNT; - do { - f = fdb_transaction_commit(transaction); - err = wait_future(f); - fdb_future_destroy(f); - if (stats) { - if (err == 1020 /* not_committed */) - stats->conflicts++; - else { - stats->errors[OP_COMMIT]++; - } - } - - if (err) { - fprintf(stderr, "ERROR: Error %d occured at fdb_transaction_commit\n", - err); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - fprintf(stderr, - "ERROR: fdb_transaction_on_error returned %d at %s:%d\n", - err, __FILE__, __LINE__); - break; - } - } else { - if (stats) - stats->ops[OP_COMMIT]++; - break; - } - } while (err && retry--); - - return err; + f = fdb_transaction_commit(transaction); + fdb_wait_and_handle_error(commit_transaction, f, transaction); + + return FDB_SUCCESS; } -void update_op_stats(struct timespec *start, struct timespec *end, int op, + +void update_op_lat_stats(struct timespec *start, struct timespec *end, int op, mako_stats_t *stats) { uint64_t latencyus; @@ -109,13 +121,12 @@ void update_op_stats(struct timespec *start, struct timespec *end, int op, } } + /* FDB network thread */ void *fdb_network_thread(void *args) { fdb_error_t err; - if (((mako_args_t *)args)->verbose == VERBOSE_DEBUG) { - printf("DEBUG: fdb_network_thread started\n"); - } + fprintf(debugme, "DEBUG: fdb_network_thread started\n"); err = fdb_run_network(); if (err) { @@ -125,6 +136,7 @@ void *fdb_network_thread(void *args) { return 0; } + /* cleanup database */ int cleanup(FDBTransaction *transaction, mako_args_t *args) { struct timespec timer_start, timer_end; @@ -138,24 +150,23 @@ int cleanup(FDBTransaction *transaction, mako_args_t *args) { clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_start); fdb_transaction_clear_range(transaction, (uint8_t *)beginstr, 5, (uint8_t *)endstr, 5); - if (commit_transaction(transaction, NULL)) - goto FDB_FAIL; + if (commit_transaction(transaction) != FDB_SUCCESS) + goto failExit; fdb_transaction_reset(transaction); clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_end); - if (args->verbose >= VERBOSE_DEFAULT) { - printf("INFO: Clear range: %6.3f sec\n", - ((timer_end.tv_sec - timer_start.tv_sec) * 1000000000.0 + - timer_end.tv_nsec - timer_start.tv_nsec) / - 1000000000); - } + fprintf(printme, "INFO: Clear range: %6.3f sec\n", + ((timer_end.tv_sec - timer_start.tv_sec) * 1000000000.0 
+ + timer_end.tv_nsec - timer_start.tv_nsec) / + 1000000000); return 0; -FDB_FAIL: +failExit: fprintf(stderr, "ERROR: FDB failure in cleanup()\n"); return -1; } + /* populate database */ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id, int thread_id, int thread_tps, mako_stats_t *stats) { @@ -221,12 +232,12 @@ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id, /* commit every 100 inserts (default) */ if (i % args->txnspec.ops[OP_INSERT][OP_COUNT] == 0) { - if (commit_transaction(transaction, NULL)) - goto FDB_FAIL; + if (commit_transaction(transaction) != FDB_SUCCESS) + goto failExit; /* xact latency stats */ clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, stats); stats->ops[OP_COMMIT]++; clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_start); @@ -237,29 +248,27 @@ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id, } } - if (commit_transaction(transaction, NULL)) - goto FDB_FAIL; + if (commit_transaction(transaction) != FDB_SUCCESS) + goto failExit; /* xact latency stats */ clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, stats); + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, stats); clock_gettime(CLOCK_MONOTONIC, &timer_end); stats->xacts++; - if (args->verbose == VERBOSE_DEBUG) { - printf("DEBUG: Populated %d rows (%d-%d): %6.3f sec\n", end - begin, begin, - end, - ((timer_end.tv_sec - timer_start.tv_sec) * 1000000000.0 + - timer_end.tv_nsec - timer_start.tv_nsec) / - 1000000000); - } + fprintf(debugme, "DEBUG: Populated %d rows (%d-%d): %6.3f sec\n", end - begin, begin, + end, + ((timer_end.tv_sec - timer_start.tv_sec) * 1000000000.0 + + timer_end.tv_nsec - timer_start.tv_nsec) / + 1000000000); free(keystr); free(valstr); return 0; -FDB_FAIL: +failExit: if (keystr) free(keystr); if (valstr) @@ -268,50 +277,40 @@ FDB_FAIL: return -1; } -int64_t run_op_getreadversion(FDBTransaction *transaction) { - int64_t rv = 0; + +int64_t run_op_getreadversion(FDBTransaction *transaction, int64_t *rv) { FDBFuture *f; fdb_error_t err; - int retry = DEFAULT_RETRY_COUNT; - do { - f = fdb_transaction_get_read_version(transaction); - err = wait_future(f); + *rv = 0; - if (err) { - fdb_future_destroy(f); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - break; - } - } - } while (err && retry--); - - if (err) { - fprintf(stderr, "ERROR: fdb_transaction_get_read_version: %s\n", fdb_get_error(err)); - return -1; - } + f = fdb_transaction_get_read_version(transaction); + fdb_wait_and_handle_error(fdb_transaction_get_read_version, f, transaction); #if FDB_API_VERSION < 620 - err = fdb_future_get_version(f, &rv); + err = fdb_future_get_version(f, rv); #else - err = fdb_future_get_int64(f, &rv); + err = fdb_future_get_int64(f, rv); #endif + fdb_future_destroy(f); if (err) { #if FDB_API_VERSION < 620 fprintf(stderr, "ERROR: fdb_future_get_version: %s\n", fdb_get_error(err)); #else fprintf(stderr, "ERROR: fdb_future_get_int64: %s\n", fdb_get_error(err)); #endif + return FDB_ERROR_RETRY; } - fdb_future_destroy(f); - return rv; + + /* fail if rv not properly set */ + if (!*rv) { + return FDB_ERROR_RETRY; + } + return FDB_SUCCESS; } + int run_op_get(FDBTransaction *transaction, char *keystr, char 
*valstr, int snapshot) { FDBFuture *f; @@ -319,41 +318,23 @@ int run_op_get(FDBTransaction *transaction, char *keystr, char *valstr, char *val; int vallen; fdb_error_t err; - int retry = DEFAULT_RETRY_COUNT; - - do { - f = fdb_transaction_get(transaction, (uint8_t *)keystr, strlen(keystr), - snapshot); - err = wait_future(f); - - if (err) { - fdb_future_destroy(f); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - break; - } - } - } while (err && retry--); - - if (err) { - fprintf(stderr, "ERROR: fdb_transaction_get: %s\n", fdb_get_error(err)); - return -1; - } + f = fdb_transaction_get(transaction, (uint8_t *)keystr, strlen(keystr), + snapshot); + fdb_wait_and_handle_error(fdb_transaction_get, f, transaction); + err = fdb_future_get_value(f, &out_present, (const uint8_t **)&val, &vallen); fdb_future_destroy(f); if (err || !out_present) { /* error or value not present */ - return -1; + return FDB_ERROR_RETRY; } strncpy(valstr, val, vallen); valstr[vallen] = '\0'; - return 0; + return FDB_SUCCESS; } + int run_op_getrange(FDBTransaction *transaction, char *keystr, char *keystr2, char *valstr, int snapshot, int reverse) { FDBFuture *f; @@ -361,111 +342,79 @@ int run_op_getrange(FDBTransaction *transaction, char *keystr, char *keystr2, FDBKeyValue const *out_kv; int out_count; int out_more; - int retry = DEFAULT_RETRY_COUNT; - do { - f = fdb_transaction_get_range( - transaction, - FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((uint8_t *)keystr, strlen(keystr)), - FDB_KEYSEL_LAST_LESS_OR_EQUAL((uint8_t *)keystr2, strlen(keystr2)) + 1, - 0 /* limit */, 0 /* target_bytes */, - FDB_STREAMING_MODE_WANT_ALL /* FDBStreamingMode */, 0 /* iteration */, - snapshot, reverse /* reverse */); - err = wait_future(f); - - if (err) { - fdb_future_destroy(f); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - break; - } - } - } while (err && retry--); - - if (err) { - fprintf(stderr, "ERROR: fdb_transaction_get_range: %s\n", fdb_get_error(err)); - return -1; - } + f = fdb_transaction_get_range( + transaction, + FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((uint8_t *)keystr, strlen(keystr)), + FDB_KEYSEL_LAST_LESS_OR_EQUAL((uint8_t *)keystr2, strlen(keystr2)) + 1, + 0 /* limit */, 0 /* target_bytes */, + FDB_STREAMING_MODE_WANT_ALL /* FDBStreamingMode */, 0 /* iteration */, + snapshot, reverse /* reverse */); + fdb_wait_and_handle_error(fdb_transaction_get_range, f, transaction); err = fdb_future_get_keyvalue_array(f, &out_kv, &out_count, &out_more); if (err) { fprintf(stderr, "ERROR: fdb_future_get_keyvalue_array: %s\n", fdb_get_error(err)); fdb_future_destroy(f); - return -1; + return FDB_ERROR_RETRY; } fdb_future_destroy(f); - return 0; + return FDB_SUCCESS; } + +/* Update -- GET and SET the same key */ int run_op_update(FDBTransaction *transaction, char *keystr, char *valstr) { FDBFuture *f; int out_present; char *val; int vallen; fdb_error_t err; - int retry = DEFAULT_RETRY_COUNT; /* GET first */ - do { - f = fdb_transaction_get(transaction, (uint8_t *)keystr, strlen(keystr), 0); - err = wait_future(f); - - if (err) { - fdb_future_destroy(f); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - break; - } - } - } while (err && retry--); - - if (err) { - fprintf(stderr, "ERROR: fdb_transaction_get: %s\n", fdb_get_error(err)); - return -1; - } + f = fdb_transaction_get(transaction, 
(uint8_t *)keystr, strlen(keystr), 0); + fdb_wait_and_handle_error(fdb_transaction_get, f, transaction); err = fdb_future_get_value(f, &out_present, (const uint8_t **)&val, &vallen); fdb_future_destroy(f); if (err || !out_present) { /* error or value not present */ - return -1; + return FDB_ERROR_RETRY; } /* Update Value (SET) */ fdb_transaction_set(transaction, (uint8_t *)keystr, strlen(keystr), (uint8_t *)valstr, strlen(valstr)); - return 0; + return FDB_SUCCESS; } + int run_op_insert(FDBTransaction *transaction, char *keystr, char *valstr) { fdb_transaction_set(transaction, (uint8_t *)keystr, strlen(keystr), (uint8_t *)valstr, strlen(valstr)); - return 0; + return FDB_SUCCESS; } + int run_op_clear(FDBTransaction *transaction, char *keystr) { fdb_transaction_clear(transaction, (uint8_t *)keystr, strlen(keystr)); - return 0; + return FDB_SUCCESS; } + int run_op_clearrange(FDBTransaction *transaction, char *keystr, char *keystr2) { fdb_transaction_clear_range(transaction, (uint8_t *)keystr, strlen(keystr), (uint8_t *)keystr2, strlen(keystr2)); - return 0; + return FDB_SUCCESS; } + /* run one transaction */ -int run_transaction(FDBTransaction *transaction, mako_args_t *args, - mako_stats_t *stats, char *keystr, char *keystr2, - char *valstr) { +int run_one_transaction(FDBTransaction *transaction, mako_args_t *args, + mako_stats_t *stats, char *keystr, char *keystr2, + char *valstr) { int i; int count; int rc; @@ -478,172 +427,228 @@ int run_transaction(FDBTransaction *transaction, mako_args_t *args, int randstrlen; int rangei; - /* transaction */ - clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_start); - for (i = 0; i < MAX_OP; i++) { + /* make sure that the transaction object is clean */ + fdb_transaction_reset(transaction); + clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_start); + + retryTxn: + for (i = 0; i < MAX_OP; i++) { + if ((args->txnspec.ops[i][OP_COUNT] > 0) && (i != OP_COMMIT)) { for (count = 0; count < args->txnspec.ops[i][OP_COUNT]; count++) { + + /* note: for simplicity, always generate a new key(s) even when retrying */ - /* pick a random key(s) */ - if (args->zipf) { - keynum = zipfian_next(); - } else { - keynum = urand(0, args->rows - 1); - } - genkey(keystr, keynum, args->rows, args->key_length + 1); + /* pick a random key(s) */ + if (args->zipf) { + keynum = zipfian_next(); + } else { + keynum = urand(0, args->rows - 1); + } + genkey(keystr, keynum, args->rows, args->key_length + 1); + + /* range */ + if (args->txnspec.ops[i][OP_RANGE] > 0) { + keyend = keynum + args->txnspec.ops[i][OP_RANGE] - 1; /* inclusive */ + if (keyend > args->rows - 1) { + keyend = args->rows - 1; + } + genkey(keystr2, keyend, args->rows, args->key_length + 1); + } + + if (stats->xacts % args->sampling == 0) { + /* per op latency */ + clock_gettime(CLOCK_MONOTONIC, &timer_start); + } + + switch (i) { + case OP_GETREADVERSION: + rc = run_op_getreadversion(transaction, &readversion); + break; + case OP_GET: + rc = run_op_get(transaction, keystr, valstr, 0); + break; + case OP_GETRANGE: + rc = run_op_getrange(transaction, keystr, keystr2, valstr, 0, + args->txnspec.ops[i][OP_REVERSE]); + break; + case OP_SGET: + rc = run_op_get(transaction, keystr, valstr, 1); + break; + case OP_SGETRANGE: + rc = run_op_getrange(transaction, keystr, keystr2, valstr, 1, + args->txnspec.ops[i][OP_REVERSE]); + break; + case OP_UPDATE: + randstr(valstr, args->value_length + 1); + rc = run_op_update(transaction, keystr, valstr); + docommit = 1; + break; + case OP_INSERT: + randstr(keystr + KEYPREFIXLEN, + 
args->key_length - KEYPREFIXLEN + 1); /* make it (almost) unique */ + randstr(valstr, args->value_length + 1); + rc = run_op_insert(transaction, keystr, valstr); + docommit = 1; + break; + case OP_INSERTRANGE: + randstrlen = args->key_length - KEYPREFIXLEN - + digits(args->txnspec.ops[i][OP_RANGE]); + randstr(keystr + KEYPREFIXLEN, randstrlen + 1); /* make it (almost) unique */ + randstr(valstr, args->value_length + 1); + for (rangei = 0; rangei < args->txnspec.ops[i][OP_RANGE]; rangei++) { + sprintf(keystr + KEYPREFIXLEN + randstrlen, "%0.*d", + digits(args->txnspec.ops[i][OP_RANGE]), rangei); + rc = run_op_insert(transaction, keystr, valstr); + if (rc != FDB_SUCCESS) + break; + } + docommit = 1; + break; + case OP_CLEAR: + rc = run_op_clear(transaction, keystr); + docommit = 1; + break; + case OP_SETCLEAR: + randstr(keystr + KEYPREFIXLEN, + args->key_length - KEYPREFIXLEN + 1); /* make it (almost) unique */ + randstr(valstr, args->value_length + 1); + rc = run_op_insert(transaction, keystr, valstr); + if (rc == FDB_SUCCESS) { + /* commit insert so mutation goes to storage */ + rc = commit_transaction(transaction); + if (rc == FDB_SUCCESS) { + stats->ops[OP_COMMIT]++; + clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, + OP_COMMIT, stats); + } else { + /* error */ + if (rc == FDB_ERROR_CONFLICT) { + stats->conflicts++; + } else { + stats->errors[OP_COMMIT]++; + } + if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + goto retryTxn; + } + fdb_transaction_reset(transaction); + rc = run_op_clear(transaction, keystr); + } + docommit = 1; + break; + case OP_CLEARRANGE: + rc = run_op_clearrange(transaction, keystr, keystr2); + docommit = 1; + break; + case OP_SETCLEARRANGE: + randstrlen = args->key_length - KEYPREFIXLEN - + digits(args->txnspec.ops[i][OP_RANGE]); + randstr(keystr + KEYPREFIXLEN, + randstrlen + 1); /* make it (almost) unique */ + randstr(valstr, args->value_length + 1); + for (rangei = 0; rangei < args->txnspec.ops[i][OP_RANGE]; rangei++) { + sprintf(keystr + KEYPREFIXLEN + randstrlen, "%0.*d", + digits(args->txnspec.ops[i][OP_RANGE]), rangei); + if (rangei == 0) { + strcpy(keystr2, keystr); + keystr2[strlen(keystr)] = '\0'; + } + rc = run_op_insert(transaction, keystr, valstr); + /* rollback not necessary, move on */ + if (rc == FDB_ERROR_RETRY) { + goto retryTxn; + } else if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + } + /* commit insert so mutation goes to storage */ + rc = commit_transaction(transaction); + if (rc == FDB_SUCCESS) { + stats->ops[OP_COMMIT]++; + clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, + OP_COMMIT, stats); + } else { + /* error */ + if (rc == FDB_ERROR_CONFLICT) { + stats->conflicts++; + } else { + stats->errors[OP_COMMIT]++; + } + if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + goto retryTxn; + } + fdb_transaction_reset(transaction); + rc = run_op_clearrange(transaction, keystr2, keystr); + docommit = 1; + break; + default: + fprintf(stderr, "ERROR: Unknown Operation %d\n", i); + break; + } - /* range */ - if (args->txnspec.ops[i][OP_RANGE] > 0) { - keyend = keynum + args->txnspec.ops[i][OP_RANGE] - 1; /* inclusive */ - if (keyend > args->rows - 1) { - keyend = args->rows - 1; - } - genkey(keystr2, keyend, args->rows, args->key_length + 1); - } + if (stats->xacts % args->sampling == 0) { + clock_gettime(CLOCK_MONOTONIC, &timer_end); + if (rc == FDB_SUCCESS) { + /* per op latency, 
record successful transactions */ + update_op_lat_stats(&timer_start, &timer_end, i, stats); + } + } - if (stats->xacts % args->sampling == 0) { - /* per op latency */ - clock_gettime(CLOCK_MONOTONIC, &timer_start); - } - - switch (i) { - case OP_GETREADVERSION: - readversion = run_op_getreadversion(transaction); - if (!readversion) { - rc = -1; - } - break; - case OP_GET: - rc = run_op_get(transaction, keystr, valstr, 0); - break; - case OP_GETRANGE: - rc = run_op_getrange(transaction, keystr, keystr2, valstr, 0, - args->txnspec.ops[i][OP_REVERSE]); - break; - case OP_SGET: - rc = run_op_get(transaction, keystr, valstr, 1); - break; - case OP_SGETRANGE: - rc = run_op_getrange(transaction, keystr, keystr2, valstr, 1, - args->txnspec.ops[i][OP_REVERSE]); - break; - case OP_UPDATE: - randstr(valstr, args->value_length + 1); - rc = run_op_update(transaction, keystr, valstr); - docommit = 1; - break; - case OP_INSERT: - randstr(keystr + KEYPREFIXLEN, args->key_length - KEYPREFIXLEN + - 1); /* make it (almost) unique */ - randstr(valstr, args->value_length + 1); - rc = run_op_insert(transaction, keystr, valstr); - docommit = 1; - break; - case OP_INSERTRANGE: - randstrlen = args->key_length - KEYPREFIXLEN - - digits(args->txnspec.ops[i][OP_RANGE]); - randstr(keystr + KEYPREFIXLEN, - randstrlen + 1); /* make it (almost) unique */ - randstr(valstr, args->value_length + 1); - for (rangei = 0; rangei < args->txnspec.ops[i][OP_RANGE]; rangei++) { - sprintf(keystr + KEYPREFIXLEN + randstrlen, "%0.*d", - digits(args->txnspec.ops[i][OP_RANGE]), rangei); - rc = run_op_insert(transaction, keystr, valstr); - if (rc != 0) - break; - } - docommit = 1; - break; - case OP_CLEAR: - rc = run_op_clear(transaction, keystr); - docommit = 1; - break; - case OP_SETCLEAR: - randstr(keystr + KEYPREFIXLEN, args->key_length - KEYPREFIXLEN + - 1); /* make it (almost) unique */ - randstr(valstr, args->value_length + 1); - rc = run_op_insert(transaction, keystr, valstr); - if (rc == 0) { - /* commit insert so mutation goes to storage */ - if (commit_transaction(transaction, stats) == 0) { - clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, - OP_COMMIT, stats); - } - fdb_transaction_reset(transaction); - rc = run_op_clear(transaction, keystr); - } - docommit = 1; - break; - case OP_CLEARRANGE: - rc = run_op_clearrange(transaction, keystr, keystr2); - docommit = 1; - break; - case OP_SETCLEARRANGE: - randstrlen = args->key_length - KEYPREFIXLEN - - digits(args->txnspec.ops[i][OP_RANGE]); - randstr(keystr + KEYPREFIXLEN, - randstrlen + 1); /* make it (almost) unique */ - randstr(valstr, args->value_length + 1); - for (rangei = 0; rangei < args->txnspec.ops[i][OP_RANGE]; rangei++) { - sprintf(keystr + KEYPREFIXLEN + randstrlen, "%0.*d", - digits(args->txnspec.ops[i][OP_RANGE]), rangei); - if (rangei == 0) { - strcpy(keystr2, keystr); - keystr2[strlen(keystr)] = '\0'; - } - rc = run_op_insert(transaction, keystr, valstr); - if (rc != 0) { - /* rollback not necessary, transaction will be reset */ - break; - } - } - /* commit inserts so mutation goes to storage */ - if (commit_transaction(transaction, stats) == 0) { - clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, - OP_COMMIT, stats); - } - fdb_transaction_reset(transaction); - rc = run_op_clearrange(transaction, keystr2, keystr); - docommit = 1; - break; - default: - fprintf(stderr, "ERROR: Unknown Operation %d\n", i); - break; - } - - if 
(stats->xacts % args->sampling == 0) { - clock_gettime(CLOCK_MONOTONIC, &timer_end); - if (rc == 0) { - /* per op latency */ - update_op_stats(&timer_start, &timer_end, i, stats); - } - } - - /* check rc */ - if (rc != 0) { - stats->errors[i]++; - } else { - stats->ops[i]++; - } + /* check rc and update stats */ + if (rc == FDB_SUCCESS) { + stats->ops[i]++; + } else { + /* error */ + if (rc == FDB_ERROR_CONFLICT) { + stats->conflicts++; + } else { + stats->errors[OP_COMMIT]++; + } + if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + goto retryTxn; + } } } } + + /* commit only successful transaction */ if (docommit | args->commit_get) { - if (commit_transaction(transaction, stats) == 0) { + rc = commit_transaction(transaction); + if (rc == FDB_SUCCESS) { + /* success */ + stats->ops[OP_COMMIT]++; clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, - stats); + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, + OP_COMMIT, stats); + } else { + /* error */ + if (rc == FDB_ERROR_CONFLICT) { + stats->conflicts++; + } else { + stats->errors[OP_COMMIT]++; + } + if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + goto retryTxn; } } + stats->xacts++; - fdb_transaction_reset(transaction); return 0; } + int run_workload(FDBTransaction *transaction, mako_args_t *args, int thread_tps, volatile double *throttle_factor, int thread_iters, volatile int *signal, mako_stats_t *stats) { @@ -677,6 +682,7 @@ int run_workload(FDBTransaction *transaction, mako_args_t *args, clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_prev); + /* main transaction loop */ while (1) { if ((thread_tps > 0) && (xacts >= current_tps)) { @@ -699,17 +705,19 @@ int run_workload(FDBTransaction *transaction, mako_args_t *args, } } - rc = run_transaction(transaction, args, stats, keystr, keystr2, valstr); + rc = run_one_transaction(transaction, args, stats, keystr, keystr2, valstr); if (rc) { - /* should never get here */ - fprintf(stderr, "ERROR: run_transaction failed (%d)\n", rc); + /* FIXME: run_one_transaction should return something meaningful */ + fprintf(annoyme, "ERROR: run_one_transaction failed (%d)\n", rc); } if (thread_iters > 0) { if (thread_iters == xacts) { + /* xact limit reached */ break; } } else if (*signal == SIGNAL_RED) { + /* signal turned red, target duration reached */ break; } xacts++; @@ -721,6 +729,7 @@ int run_workload(FDBTransaction *transaction, mako_args_t *args, return rc; } + /* mako worker thread */ void *worker_thread(void *thread_args) { int worker_id = ((thread_args_t *)thread_args)->process->worker_id; @@ -749,11 +758,9 @@ void *worker_thread(void *thread_args) { stats->latency_us_total[op] = 0; } - if (args->verbose == VERBOSE_DEBUG) { - printf("DEBUG: worker_id:%d (%d) thread_id:%d (%d) (tid:%d)\n", worker_id, - args->num_processes, thread_id, args->num_threads, - (unsigned int)pthread_self()); - } + fprintf(debugme, "DEBUG: worker_id:%d (%d) thread_id:%d (%d) (tid:%d)\n", worker_id, + args->num_processes, thread_id, args->num_threads, + (unsigned int)pthread_self()); if (args->tpsmax) { thread_tps = compute_thread_tps(args->tpsmax, worker_id, thread_id, @@ -801,11 +808,12 @@ void *worker_thread(void *thread_args) { } /* fall through */ -FDB_FAIL: +failExit: fdb_transaction_destroy(transaction); pthread_exit(0); } + /* mako worker process */ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { int i; @@ -824,23 +832,16 @@ int worker_process_main(mako_args_t *args, int 
worker_id, mako_shmhdr_t *shm) { process.args = args; process.shm = (mako_shmhdr_t *)shm; - if (args->verbose == VERBOSE_DEBUG) { - printf("DEBUG: worker %d started\n", worker_id); - } + fprintf(debugme, "DEBUG: worker %d started\n", worker_id); /* Everything starts from here */ - /* Let's use the maximum API version */ - // fprintf(stderr, "fdb_get_max_api_version: %d\n", - // fdb_get_max_api_version()); - err = fdb_select_api_version(fdb_get_max_api_version()); + err = fdb_select_api_version(args->api_version); check_fdb_error(err); /* enable flatbuffers if specified */ if (args->flatbuffers) { #ifdef FDB_NET_OPTION_USE_FLATBUFFERS - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: Using flatbuffers\n"); - } + fprintf(debugme, "DEBUG: Using flatbuffers\n"); err = fdb_network_set_option(FDB_NET_OPTION_USE_FLATBUFFERS, (uint8_t *)&args->flatbuffers, sizeof(uint8_t)); @@ -851,20 +852,16 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { fdb_get_error(err)); } #else - if (args->verbose >= VERBOSE_DEFAULT) { - printf("INFO: flatbuffers is not supported in FDB API version %d\n", - FDB_API_VERSION); - } + fprintf(printme, "INFO: flatbuffers is not supported in FDB API version %d\n", + FDB_API_VERSION); #endif } /* enable tracing if specified */ if (args->trace) { - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: Enable Tracing (%s)\n", (args->tracepath[0] == '\0') - ? "current directory" - : args->tracepath); - } + fprintf(debugme, "DEBUG: Enable Tracing (%s)\n", (args->tracepath[0] == '\0') + ? "current directory" + : args->tracepath); err = fdb_network_set_option(FDB_NET_OPTION_TRACE_ENABLE, (uint8_t *)args->tracepath, strlen(args->tracepath)); @@ -881,9 +878,7 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { char delim[] = ", "; char *knob = strtok(args->knobs, delim); while (knob != NULL) { - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: Setting client knobs: %s\n", knob); - } + fprintf(debugme, "DEBUG: Setting client knobs: %s\n", knob); err = fdb_network_set_option(FDB_NET_OPTION_KNOB, (uint8_t *)knob, strlen(knob)); if (err) { @@ -895,16 +890,12 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { } /* Network thread must be setup before doing anything */ - if (args->verbose == VERBOSE_DEBUG) { - printf("DEBUG: fdb_setup_network\n"); - } + fprintf(debugme, "DEBUG: fdb_setup_network\n"); err = fdb_setup_network(); check_fdb_error(err); /* Each worker process will have its own network thread */ - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: creating network thread\n"); - } + fprintf(debugme, "DEBUG: creating network thread\n"); rc = pthread_create(&network_thread, NULL, fdb_network_thread, (void *)args); if (rc != 0) { fprintf(stderr, "ERROR: Cannot create a network thread\n"); @@ -935,13 +926,11 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { fdb_create_database(args->cluster_file, &process.database); #endif - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: creating %d worker threads\n", args->num_threads); - } + fprintf(debugme, "DEBUG: creating %d worker threads\n", args->num_threads); worker_threads = (pthread_t *)calloc(sizeof(pthread_t), args->num_threads); if (!worker_threads) { fprintf(stderr, "ERROR: cannot allocate worker_threads\n"); - goto EXIT; + goto failExit; } /* spawn worker threads */ @@ -949,7 +938,7 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { (thread_args_t 
*)calloc(sizeof(thread_args_t), args->num_threads); if (!thread_args) { fprintf(stderr, "ERROR: cannot allocate thread_args\n"); - goto EXIT; + goto failExit; } for (i = 0; i < args->num_threads; i++) { @@ -967,16 +956,14 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { /* wait for everyone to finish */ for (i = 0; i < args->num_threads; i++) { - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: worker_thread %d joining\n", i); - } + fprintf(debugme, "DEBUG: worker_thread %d joining\n", i); rc = pthread_join(worker_threads[i], NULL); if (rc != 0) { fprintf(stderr, "ERROR: threads %d failed to join\n", i); } } -EXIT: +failExit: if (worker_threads) free(worker_threads); if (thread_args) @@ -989,18 +976,12 @@ EXIT: #endif /* stop the network thread */ - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: fdb_stop_network\n"); - } + fprintf(debugme, "DEBUG: fdb_stop_network\n"); err = fdb_stop_network(); check_fdb_error(err); -FDB_FAIL: - /* wait for the network thread to join */ - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: network_thread joining\n"); - } + fprintf(debugme, "DEBUG: network_thread joining\n"); rc = pthread_join(network_thread, NULL); if (rc != 0) { fprintf(stderr, "ERROR: network thread failed to join\n"); @@ -1009,30 +990,32 @@ FDB_FAIL: return 0; } + /* initialize the parameters with default values */ int init_args(mako_args_t *args) { int i; if (!args) return -1; memset(args, 0, sizeof(mako_args_t)); /* zero-out everything */ + args->api_version = fdb_get_max_api_version(); args->json = 0; args->num_processes = 1; args->num_threads = 1; args->mode = MODE_INVALID; - args->rows = 10000; - args->seconds = 0; + args->rows = 100000; + args->seconds = 30; args->iteration = 0; args->tpsmax = 0; args->tpsmin = -1; args->tpsinterval = 10; args->tpschange = TPS_SIN; args->sampling = 1000; - args->key_length = 16; + args->key_length = 32; args->value_length = 16; args->zipf = 0; args->commit_get = 0; args->verbose = 1; - args->flatbuffers = 0; + args->flatbuffers = 0; /* internal */ args->knobs[0] = '\0'; args->trace = 0; args->tracepath[0] = '\0'; @@ -1042,6 +1025,7 @@ int init_args(mako_args_t *args) { return 0; } + /* parse transaction specification */ int parse_transaction(mako_args_t *args, char *optarg) { char *ptr = optarg; @@ -1099,9 +1083,7 @@ int parse_transaction(mako_args_t *args, char *optarg) { op = OP_SETCLEAR; ptr += 2; } else { - if (args->verbose == VERBOSE_DEBUG) { - printf("Error: Invalid transaction spec: %s\n", ptr); - } + fprintf(debugme, "Error: Invalid transaction spec: %s\n", ptr); error = 1; break; } @@ -1155,7 +1137,7 @@ int parse_transaction(mako_args_t *args, char *optarg) { if (args->verbose == VERBOSE_DEBUG) { for (op = 0; op < MAX_OP; op++) { - printf("DEBUG: OP: %d: %d: %d\n", op, args->txnspec.ops[op][0], + fprintf(debugme, "DEBUG: OP: %d: %d: %d\n", op, args->txnspec.ops[op][0], args->txnspec.ops[op][1]); } } @@ -1163,11 +1145,13 @@ int parse_transaction(mako_args_t *args, char *optarg) { return 0; } + void usage() { printf("Usage:\n"); printf("%-24s%s\n", "-h, --help", "Print this message"); printf("%-24s%s\n", " --version", "Print FDB version"); printf("%-24s%s\n", "-v, --verbose", "Specify verbosity"); + printf("%-24s%s\n", "-a, --api_version=API_VERSION", "Specify API_VERSION to use"); printf("%-24s%s\n", "-c, --cluster=FILE", "Specify FDB cluster file"); printf("%-24s%s\n", "-p, --procs=PROCS", "Specify number of worker processes"); @@ -1200,15 +1184,17 @@ void usage() { printf("%-24s%s\n", " 
--flatbuffers", "Use flatbuffers"); } + /* parse benchmark paramters */ int parse_args(int argc, char *argv[], mako_args_t *args) { int rc; int c; int idx; while (1) { - const char *short_options = "c:p:t:r:s:i:x:v:m:hjz"; + const char *short_options = "a:c:p:t:r:s:i:x:v:m:hjz"; static struct option long_options[] = { /* name, has_arg, flag, val */ + {"api_version", required_argument, NULL, 'a'}, {"cluster", required_argument, NULL, 'c'}, {"procs", required_argument, NULL, 'p'}, {"threads", required_argument, NULL, 't'}, @@ -1246,6 +1232,9 @@ int parse_args(int argc, char *argv[], mako_args_t *args) { case 'h': usage(); return -1; + case 'a': + args->api_version = atoi(optarg); + break; case 'c': strcpy(args->cluster_file, optarg); break; @@ -1340,9 +1329,27 @@ int parse_args(int argc, char *argv[], mako_args_t *args) { if ((args->tpsmin == -1) || (args->tpsmin > args->tpsmax)) { args->tpsmin = args->tpsmax; } + + if (args->verbose >= VERBOSE_DEFAULT) { + printme = stdout; + } else { + printme = fopen("/dev/null", "w"); + } + if (args->verbose >= VERBOSE_ANNOYING) { + annoyme = stdout; + } else { + annoyme = fopen("/dev/null", "w"); + } + if (args->verbose >= VERBOSE_DEBUG) { + debugme = stdout; + } else { + debugme = fopen("/dev/null", "w"); + } + return 0; } + int validate_args(mako_args_t *args) { if (args->mode == MODE_INVALID) { fprintf(stderr, "ERROR: --mode has to be set\n"); @@ -1380,6 +1387,7 @@ int validate_args(mako_args_t *args) { return 0; } + /* stats output formatting */ #define STR2(x) #x #define STR(x) STR2(x) @@ -1446,6 +1454,7 @@ void print_stats(mako_args_t *args, mako_stats_t *stats, struct timespec *now, return; } + void print_stats_header(mako_args_t *args) { int op; int i; @@ -1518,6 +1527,7 @@ void print_stats_header(mako_args_t *args) { printf("\n"); } + void print_report(mako_args_t *args, mako_stats_t *stats, struct timespec *timer_now, struct timespec *timer_start) { int i, j, op; @@ -1654,6 +1664,7 @@ void print_report(mako_args_t *args, mako_stats_t *stats, printf("\n"); } + int stats_process_main(mako_args_t *args, mako_stats_t *stats, volatile double *throttle_factor, volatile int *signal) { struct timespec timer_start, timer_prev, timer_now; @@ -1723,6 +1734,7 @@ int stats_process_main(mako_args_t *args, mako_stats_t *stats, return 0; } + int main(int argc, char *argv[]) { int rc; mako_args_t args; @@ -1779,7 +1791,7 @@ int main(int argc, char *argv[]) { if (ftruncate(shmfd, shmsize) < 0) { fprintf(stderr, "ERROR: ftruncate (fd:%d size:%llu) failed\n", shmfd, (unsigned long long)shmsize); - goto EXIT; + goto failExit; } /* map it */ @@ -1788,7 +1800,7 @@ int main(int argc, char *argv[]) { if (shm == MAP_FAILED) { fprintf(stderr, "ERROR: mmap (fd:%d size:%llu) failed\n", shmfd, (unsigned long long)shmsize); - goto EXIT; + goto failExit; } stats = (mako_stats_t *)((void *)shm + sizeof(mako_shmhdr_t)); @@ -1806,7 +1818,7 @@ int main(int argc, char *argv[]) { if (!worker_pids) { fprintf(stderr, "ERROR: cannot allocate worker_pids (%d processes)\n", args.num_processes); - goto EXIT; + goto failExit; } /* forking (num_process + 1) children */ @@ -1920,7 +1932,7 @@ int main(int argc, char *argv[]) { worker_pids[args.num_processes]); } -EXIT: +failExit: if (worker_pids) free(worker_pids); diff --git a/bindings/c/test/mako/mako.h b/bindings/c/test/mako/mako.h index 334a8774f8..d924f8a648 100755 --- a/bindings/c/test/mako/mako.h +++ b/bindings/c/test/mako/mako.h @@ -17,8 +17,6 @@ #include #endif -#define DEFAULT_RETRY_COUNT 3 - #define VERBOSE_NONE 0 #define 
VERBOSE_DEFAULT 1 #define VERBOSE_ANNOYING 2 @@ -29,9 +27,11 @@ #define MODE_BUILD 1 #define MODE_RUN 2 -/* we set mako_txn_t and mako_args_t only once in the master process, - * and won't be touched by child processes. - */ +#define FDB_SUCCESS 0 +#define FDB_ERROR_RETRY -1 +#define FDB_ERROR_ABORT -2 +#define FDB_ERROR_CONFLICT -3 + /* transaction specification */ enum Operations { @@ -55,7 +55,7 @@ enum Operations { #define OP_RANGE 1 #define OP_REVERSE 2 -/* for arguments */ +/* for long arguments */ enum Arguments { ARG_KEYLEN, ARG_VALLEN, @@ -82,6 +82,10 @@ enum TPSChangeTypes { #define KEYPREFIX "mako" #define KEYPREFIXLEN 4 +/* we set mako_txnspec_t and mako_args_t only once in the master process, + * and won't be touched by child processes. + */ + typedef struct { /* for each operation, it stores "count", "range" and "reverse" */ int ops[MAX_OP][3]; @@ -91,6 +95,7 @@ typedef struct { /* benchmark parameters */ typedef struct { + int api_version; int json; int num_processes; int num_threads; diff --git a/bindings/c/test/mako/mako.rst b/bindings/c/test/mako/mako.rst index 218642b7b3..05dcb525fc 100644 --- a/bindings/c/test/mako/mako.rst +++ b/bindings/c/test/mako/mako.rst @@ -38,6 +38,9 @@ Arguments | - ``build``: Populate data | - ``run``: Run the benchmark +- | ``-a | --api_version `` + | FDB API version to use (Default: Latest) + - | ``-c | --cluster `` | FDB cluster file (Required) @@ -48,7 +51,7 @@ Arguments | Number of threads per worker process (Default: 1) - | ``-r | --rows `` - | Number of rows populated (Default: 10000) + | Number of rows populated (Default: 100000) - | ``-s | --seconds `` | Test duration in seconds (Default: 30) @@ -58,12 +61,23 @@ Arguments | Specify the number of operations to be executed. | This option cannot be set with ``--seconds``. -- | ``--tps `` - | Target total transaction-per-second (TPS) of all worker processes/threads +- | ``--tps|--tpsmax `` + | Target total transaction-per-second (TPS) of all worker processes/threads. + | When --tpsmin is also specified, this defines the upper-bound TPS. | (Default: Unset / Unthrottled) +- | ``--tpsmin `` + | Target total lower-bound TPS of all worker processes/threads + | (Default: Unset / Unthrottled) + +- | ``--tpsinterval `` + | Time period over which TPS oscillates between --tpsmax and --tpsmin (Default: 10) + +- | ``--tpschange `` + | Shape of the TPS change (Default: sin) + - | ``--keylen `` - | Key string length in bytes (Default and Minimum: 16) + | Key string length in bytes (Default and Minimum: 32) - | ``--vallen `` | Value string length in bytes (Default and Minimum: 16) @@ -75,22 +89,19 @@ Arguments | Generate a skewed workload based on Zipf distribution (Default: Unset = Uniform) - | ``--sampling `` - | Sampling rate (1 sample / ops) for latency stats + | Sampling rate (1 sample / ops) for latency stats (Default: 1000) - | ``--trace`` - | Enable tracing. The trace file will be created in the current directory. + | Enable tracing. The trace file will be created in the current directory. (Default: Unset) - | ``--tracepath `` | Enable tracing and set the trace file path.
- | ``--knobs `` - | Set client knobs - -- | ``--flatbuffers`` - | Enable flatbuffers + | Set client knobs (comma-separated) - | ``--commitget`` - | Force commit for read-only transactions + | Force commit for read-only transactions (Default: Unset) - | ``-v | --verbose `` | Set verbose level (Default: 1) diff --git a/bindings/java/src/main/com/apple/foundationdb/FDB.java b/bindings/java/src/main/com/apple/foundationdb/FDB.java index e20fa90432..621417256d 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDB.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDB.java @@ -30,7 +30,7 @@ import java.util.concurrent.atomic.AtomicInteger; /** * The starting point for accessing FoundationDB. *
- * <h1>Setting API version</h1>
+ * <h3>Setting API version</h3>
* The FoundationDB API is accessed with a call to {@link #selectAPIVersion(int)}. * This call is required before using any other part of the API. The call allows * an error to be thrown at this point to prevent client code from accessing a later library @@ -49,11 +49,11 @@ import java.util.concurrent.atomic.AtomicInteger; * being used to connect to the cluster. In particular, you should not advance * the API version of your application after upgrading your client until the * cluster has also been upgraded.
- * <h1>Getting a database</h1>
+ * <h3>Getting a database</h3>
* Once the API version has been set, the easiest way to get a {@link Database} object to use is * to call {@link #open}. *
- * <h1>Client networking</h1>
+ * <h3>Client networking</h3>
* The network is started either implicitly with a call to a variant of {@link #open()} * or started explicitly with a call to {@link #startNetwork()}. *
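The Javadoc above describes the required startup order: select an API version first, then open a database, with the network started implicitly. A minimal sketch of that sequence (not part of this patch; the class name and the key/value pair are illustrative, and it assumes API version 620 with the default cluster file):

import com.apple.foundationdb.Database;
import com.apple.foundationdb.FDB;
import com.apple.foundationdb.tuple.Tuple;

public class StartupExample {
    public static void main(String[] args) {
        // Must be the first call into the API; pins client behavior to one version.
        FDB fdb = FDB.selectAPIVersion(620);
        // open() implicitly starts the network thread and reads the default cluster file.
        try (Database db = fdb.open()) {
            // run() retries the transaction function on retryable errors.
            db.run(tr -> {
                tr.set(Tuple.from("hello").pack(), Tuple.from("world").pack());
                return null;
            });
        }
    }
}

Calling startNetwork() before open() is the explicit variant; open() starts the network only if it is not already running.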
diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java index e5556faaa6..70dde8d2b5 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java @@ -39,7 +39,7 @@ import com.apple.foundationdb.Range; * the same order in which they would sort in FoundationDB. {@code Tuple}s sort * first by the first element, then by the second, etc. This makes the tuple layer * ideal for building a variety of higher-level data models.
- * <h1>Types</h1>
+ * <h3>Types</h3>
* A {@code Tuple} can * contain byte arrays ({@code byte[]}), {@link String}s, {@link Number}s, {@link UUID}s, * {@code boolean}s, {@link List}s, {@link Versionstamp}s, other {@code Tuple}s, and {@code null}. @@ -50,7 +50,7 @@ import com.apple.foundationdb.Range; * a {@code long} integral value, so the range will be constrained to * [{@code -2^63}, {@code 2^63-1}]. Note that for numbers outside this range the way that Java * truncates integral values may yield unexpected results.
- * <h1>{@code null} values</h1>
+ * <h3>{@code null} values</h3>
* The FoundationDB tuple specification has a special type-code for {@code None}; {@code nil}; or, * as Java would understand it, {@code null}. * The behavior of the layer in the presence of {@code null} varies by type with the intention diff --git a/bindings/java/src/main/overview.html.in b/bindings/java/src/main/overview.html.in index d594b769e3..648a4e3478 100644 --- a/bindings/java/src/main/overview.html.in +++ b/bindings/java/src/main/overview.html.in @@ -2,7 +2,7 @@ This documents the client API for using FoundationDB from Java.

-<h1>Installation</h1>
+<h3>Installation</h3>
FoundationDB's Java bindings rely on native libraries that are installed as part of the FoundationDB client binaries installation (see @@ -10,7 +10,7 @@ Installing FoundationDB client binaries). The JAR can be downloaded from our website and then added to your classpath.

-<h1>Getting started</h1>
+<h3>Getting started</h3>
To start using FoundationDB from Java, create an instance of the {@link com.apple.foundationdb.FDB FoundationDB API interface} with the version of the API that you want to use (this release of the FoundationDB Java API supports versions between {@code 510} and {@code 620}). @@ -50,7 +50,7 @@ public class Example { } }
-<h1>FoundationDB {@link com.apple.foundationdb.tuple Tuple API}</h1>
+<h3>FoundationDB {@link com.apple.foundationdb.tuple Tuple API}</h3>
The {@link com.apple.foundationdb.tuple Tuple API} is provided with the core Java API for FoundationDB. This layer is provided in some form in all official language bindings. It enables cross-language support for storing and retrieving typed data from the @@ -60,7 +60,7 @@ binary data that FoundationDB supports. And, just as importantly, data packed in and general Tuple documentation for information about how Tuples sort and can be used to efficiently model data.
-<h1>FoundationDB {@link com.apple.foundationdb.directory Directory API}</h1>
+<h3>FoundationDB {@link com.apple.foundationdb.directory Directory API}</h3>
The {@link com.apple.foundationdb.directory Directory API} is provided with the core Java API for FoundationDB. This layer is provided in some form in all official language bindings. The FoundationDB API provides directories as a tool for diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index b2f9b72ea7..c494e19229 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -130,9 +130,69 @@ function(add_fdb_test) ${VALGRIND_OPTION} ${ADD_FDB_TEST_TEST_FILES} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - get_filename_component(test_dir_full ${first_file} DIRECTORY) - if(NOT ${test_dir_full} STREQUAL "") - get_filename_component(test_dir ${test_dir_full} NAME) - set_tests_properties(${test_name} PROPERTIES TIMEOUT ${this_test_timeout} LABELS "${test_dir}") - endif() + get_filename_component(test_dir_full ${first_file} DIRECTORY) + if(NOT ${test_dir_full} STREQUAL "") + get_filename_component(test_dir ${test_dir_full} NAME) + set_tests_properties(${test_name} PROPERTIES TIMEOUT ${this_test_timeout} LABELS "${test_dir}") + endif() + # set variables used for generating test packages + set(TEST_NAMES ${TEST_NAMES} ${test_name} PARENT_SCOPE) + set(TEST_FILES_${test_name} ${ADD_FDB_TEST_TEST_FILES} PARENT_SCOPE) + set(TEST_TYPE_${test_name} ${test_type} PARENT_SCOPE) +endfunction() + +if(NOT WIN32) + set(TEST_PACKAGE_INCLUDE ".*" CACHE STRING "A regex of all tests that should be included in the test package") + set(TEST_PACKAGE_EXCLUDE ".^" CACHE STRING "A regex of all tests that shouldn't be added to the test package") + set(TEST_PACKAGE_ADD_DIRECTORIES "" CACHE STRING "A ;-separated list of directories. All files within each directory will be added to the test package") +endif() + +function(create_test_package) + if(WIN32) + return() + endif() + string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length) + foreach(test IN LISTS TEST_NAMES) + if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND + (${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND + (NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE})) + foreach(file IN LISTS TEST_FILES_${test}) + string(SUBSTRING ${file} ${base_length} -1 rel_out_file) + set(out_file ${CMAKE_BINARY_DIR}/packages/tests/${rel_out_file}) + list(APPEND out_files ${out_file}) + get_filename_component(test_dir ${out_file} DIRECTORY) + file(MAKE_DIRECTORY packages/tests/${test_dir}) + add_custom_command( + OUTPUT ${out_file} + DEPENDS ${file} + COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file}) + endforeach() + endif() + endforeach() + foreach(dir IN LISTS TEST_PACKAGE_ADD_DIRECTORIES) + file(GLOB_RECURSE files ${dir}/*) + string(LENGTH ${dir} dir_len) + foreach(file IN LISTS files) + get_filename_component(src_dir ${file} DIRECTORY) + # We need to make sure that ${src_dir} is at least + # as long as ${dir}. 
Otherwise the later call to # SUBSTRING will fail + set(src_dir "${src_dir}/") + string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir) + string(SUBSTRING ${file} ${dir_len} -1 out_file) + list(APPEND external_files ${CMAKE_BINARY_DIR}/packages/${out_file}) + file(COPY ${file} DESTINATION ${CMAKE_BINARY_DIR}/packages/${dest_dir}) + endforeach() + endforeach() + set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness.tar.gz) + add_custom_command( + OUTPUT ${tar_file} + DEPENDS ${out_files} + COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver + ${out_files} ${external_files} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages + COMMENT "Package correctness archive" + ) + add_custom_target(package_tests DEPENDS ${tar_file}) + add_dependencies(package_tests strip_fdbserver) endfunction() diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 171b8d7db4..47f95b6d22 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -217,7 +217,13 @@ else() else() add_compile_options(-Werror) endif() - add_compile_options($<$<CXX_COMPILER_ID:GNU>:-Wno-pragmas>) + if (GCC) + add_compile_options(-Wno-pragmas) + + # Otherwise `state [[maybe_unused]] int x;` will issue a warning. + # https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is + add_compile_options(-Wno-attributes) + endif() add_compile_options(-Wno-error=format -Wunused-variable -Wno-deprecated @@ -235,9 +241,14 @@ else() # Check whether we can use dtrace probes include(CheckSymbolExists) check_symbol_exists(DTRACE_PROBE sys/sdt.h SUPPORT_DTRACE) + check_symbol_exists(aligned_alloc stdlib.h HAS_ALIGNED_ALLOC) + message(STATUS "Has aligned_alloc: ${HAS_ALIGNED_ALLOC}") if(SUPPORT_DTRACE) add_compile_definitions(DTRACE_PROBES) endif() + if(HAS_ALIGNED_ALLOC) + add_compile_definitions(HAS_ALIGNED_ALLOC) + endif() if(CMAKE_COMPILER_IS_GNUCXX) set(USE_LTO OFF CACHE BOOL "Do link time optimization") diff --git a/cmake/FlowCommands.cmake b/cmake/FlowCommands.cmake index 8c9d964d3e..19df995f25 100644 --- a/cmake/FlowCommands.cmake +++ b/cmake/FlowCommands.cmake @@ -136,7 +136,6 @@ function(strip_debug_symbols target) add_custom_command(OUTPUT "${out_file}.debug" COMMAND objcopy --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug" && objcopy --add-gnu-debuglink="${out_file}.debug" ${out_file} - DEPENDS "${out_file}" COMMENT "Copy debug symbols to ${out_name}.debug") list(APPEND out_files "${out_file}.debug") endif() diff --git a/tools/alloc_instrumentation.py b/contrib/alloc_instrumentation.py similarity index 100% rename from tools/alloc_instrumentation.py rename to contrib/alloc_instrumentation.py diff --git a/monitoring/CMakeLists.txt b/contrib/monitoring/CMakeLists.txt similarity index 100% rename from monitoring/CMakeLists.txt rename to contrib/monitoring/CMakeLists.txt diff --git a/monitoring/actor_flamegraph.cpp b/contrib/monitoring/actor_flamegraph.cpp similarity index 100% rename from monitoring/actor_flamegraph.cpp rename to contrib/monitoring/actor_flamegraph.cpp diff --git a/contrib/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer.py new file mode 100644 index 0000000000..c7d6e0c602 --- /dev/null +++ b/contrib/transaction_profiling_analyzer.py @@ -0,0 +1,806 @@ +""" +Requirements: +python3 +fdb python bindings +optional packages: + dateparser (for human date parsing) + sortedcontainers (for estimating key range read/write density) +""" + + +import argparse +from collections import defaultdict +from enum
import Enum +import fdb +from fdb.impl import strinc +import json +from json import JSONEncoder +import logging +import struct +from bisect import bisect_left +import time + +PROTOCOL_VERSION_5_2 = 0x0FDB00A552000001 +PROTOCOL_VERSION_6_0 = 0x0FDB00A570010001 +PROTOCOL_VERSION_6_1 = 0x0FDB00B061060001 +PROTOCOL_VERSION_6_2 = 0x0FDB00B062010001 +supported_protocol_versions = frozenset([PROTOCOL_VERSION_5_2, PROTOCOL_VERSION_6_0, PROTOCOL_VERSION_6_1, + PROTOCOL_VERSION_6_2]) + + +fdb.api_version(600) + +BASIC_FORMAT = "%(asctime)s - %(levelname)-8s %(message)s" +LOG_PATH = "transaction_profiling_analyzer.log" + + +def setup_logger(name): + root = logging.getLogger(name) + root.setLevel(logging.DEBUG) + root.propagate = False + + file_formatter = logging.Formatter(BASIC_FORMAT) + + file_handler = logging.FileHandler(LOG_PATH) + file_handler.setFormatter(file_formatter) + file_handler.setLevel(logging.DEBUG) + + root.addHandler(file_handler) + + return root + + +logger = setup_logger(__name__) + + +class ByteBuffer(object): + def __init__(self, val): + self._offset = 0 + self.val = val + + def get_bytes(self, n): + if self._offset + n > len(self.val): + raise IndexError("Request to read %d bytes with only %d remaining" % (n, self.get_remaining_bytes())) + ret = self.val[self._offset:self._offset + n] + self._offset += n + return ret + + def get_int(self): + return struct.unpack("<i", self.get_bytes(4))[0] + + def get_long(self): + return struct.unpack("<q", self.get_bytes(8))[0] + + def get_double(self): + return struct.unpack("<d", self.get_bytes(8))[0] + + def get_bytes_with_length(self): + length = self.get_int() + return self.get_bytes(length) + + def get_key_range(self): + return KeyRange(self.get_bytes_with_length(), self.get_bytes_with_length()) + + def get_key_range_list(self): + length = self.get_int() + return [self.get_key_range() for _ in range(0, length)] + + def get_mutation(self): + return Mutation(ord(self.get_bytes(1)), self.get_bytes_with_length(), self.get_bytes_with_length()) + + def get_mutation_list(self): + length = self.get_int() + return [self.get_mutation() for _ in range(0, length)] + + def get_remaining_bytes(self): + return len(self.val) - self._offset + + +class ObjJsonEncoder(JSONEncoder): + def default(self, o): + if hasattr(o, '__dict__'): + return o.__dict__ + return str(o) + + +class KeyRange(object): + def __init__(self, start_key, end_key): + self.start_key = start_key + self.end_key = end_key + + +class Mutation(object): + def __init__(self, code, param_one, param_two): + self.code = code + self.param_one = param_one + self.param_two = param_two + + +class TrInfoChunk(object): + def __init__(self, num_chunks, chunk_num, key, value): + self.num_chunks = num_chunks + self.chunk_num = chunk_num + self.key = key + self.value = value + + +class BaseInfo(object): + def __init__(self, start_timestamp): + self.start_timestamp = start_timestamp + + +class GetVersionInfo(BaseInfo): + def __init__(self, bb, protocol_version): + super().__init__(bb.get_double()) + self.latency = bb.get_double() + if protocol_version >= PROTOCOL_VERSION_6_2: + self.transaction_priority_type = bb.get_int() + + +class GetInfo(BaseInfo): + def __init__(self, bb): + super().__init__(bb.get_double()) + self.latency = bb.get_double() + self.value_size = bb.get_int() + self.key = bb.get_bytes_with_length() + + +class GetRangeInfo(BaseInfo): + def __init__(self, bb): + super().__init__(bb.get_double()) + self.latency = bb.get_double() + self.range_size = bb.get_int() + self.key_range = bb.get_key_range() + + +class CommitInfo(BaseInfo): + def __init__(self, bb, full_output=True): + super().__init__(bb.get_double()) + self.latency = bb.get_double() + self.num_mutations = bb.get_int() + self.commit_bytes = bb.get_int() + + read_conflict_range = bb.get_key_range_list() + if full_output: + self.read_conflict_range = read_conflict_range + write_conflict_range = bb.get_key_range_list() + if full_output: + self.write_conflict_range = write_conflict_range + mutations = bb.get_mutation_list() + if full_output: + self.mutations = mutations + + self.read_snapshot_version = bb.get_long() + + +class ErrorGetInfo(BaseInfo): + def __init__(self, bb): + super().__init__(bb.get_double()) + self.error_code = bb.get_int() + self.key = bb.get_bytes_with_length() + + +class ErrorGetRangeInfo(BaseInfo): + def __init__(self, bb): + super().__init__(bb.get_double()) + self.error_code = bb.get_int() + self.key_range = bb.get_key_range() + + +class ErrorCommitInfo(BaseInfo): + def __init__(self, bb, full_output=True): + super().__init__(bb.get_double()) + self.error_code = bb.get_int() + + read_conflict_range = bb.get_key_range_list() + if full_output: + self.read_conflict_range = read_conflict_range + write_conflict_range = bb.get_key_range_list() + if full_output: + self.write_conflict_range = write_conflict_range + mutations = bb.get_mutation_list() + if full_output: + self.mutations = mutations + + self.read_snapshot_version = bb.get_long() + + +class UnsupportedProtocolVersionError(Exception): + def __init__(self, protocol_version): + super().__init__("Unsupported protocol version 0x%0.2X" % protocol_version) + + +class ClientTransactionInfo: + def __init__(self, bb,
full_output=True, type_filter=None): + self.get_version = None + self.gets = [] + self.get_ranges = [] + self.commit = None + self.error_gets = [] + self.error_get_ranges = [] + self.error_commits = [] + + protocol_version = bb.get_long() + if protocol_version not in supported_protocol_versions: + raise UnsupportedProtocolVersionError(protocol_version) + while bb.get_remaining_bytes(): + event = bb.get_int() + if event == 0: + # we need to read it to consume the buffer even if we don't want to store it + get_version = GetVersionInfo(bb, protocol_version) + if (not type_filter or "get_version" in type_filter): + self.get_version = get_version + elif event == 1: + get = GetInfo(bb) + if (not type_filter or "get" in type_filter): + # because of the crappy json serialization using __dict__ we have to set the list here otherwise + # it doesn't print + if not self.gets: self.gets = [] + self.gets.append(get) + elif event == 2: + get_range = GetRangeInfo(bb) + if (not type_filter or "get_range" in type_filter): + if not self.get_ranges: self.get_ranges = [] + self.get_ranges.append(get_range) + elif event == 3: + commit = CommitInfo(bb, full_output=full_output) + if (not type_filter or "commit" in type_filter): + self.commit = commit + elif event == 4: + error_get = ErrorGetInfo(bb) + if (not type_filter or "error_gets" in type_filter): + if not self.error_gets: self.error_gets = [] + self.error_gets.append(error_get) + elif event == 5: + error_get_range = ErrorGetRangeInfo(bb) + if (not type_filter or "error_get_range" in type_filter): + if not self.error_get_ranges: self.error_get_ranges = [] + self.error_get_ranges.append(error_get_range) + elif event == 6: + error_commit = ErrorCommitInfo(bb, full_output=full_output) + if (not type_filter or "error_commit" in type_filter): + if not self.error_commits: self.error_commits = [] + self.error_commits.append(error_commit) + else: + raise Exception("Unknown event type %d" % event) + + def has_types(self): + return self.get_version or self.gets or self.get_ranges or self.commit or self.error_gets \ + or self.error_get_ranges or self.error_commits + + def to_json(self): + return json.dumps(self, cls=ObjJsonEncoder, sort_keys=True) + + +class TransactionInfoLoader(object): + max_num_chunks_to_store = 1000 # Each chunk would be 100 KB in size + + def __init__(self, db, full_output=True, type_filter=None, min_timestamp=None, max_timestamp=None): + self.db = db + self.full_output = full_output + self.type_filter = type_filter + self.min_timestamp = min_timestamp + self.max_timestamp = max_timestamp + ''' + Keys look like this + FF - 2 bytes \xff\x02 + SSSSSSSSSS - 10 bytes Version Stamp + RRRRRRRRRRRRRRRR - 16 bytes Transaction id + NNNN - 4 Bytes Chunk number + TTTT - 4 Bytes Total number of chunks + ''' + sample_key = "FF/fdbClientInfo/client_latency/SSSSSSSSSS/RRRRRRRRRRRRRRRR/NNNNTTTT/" + + self.client_latency_start = b'\xff\x02/fdbClientInfo/client_latency/' + self.client_latency_start_key_selector = fdb.KeySelector.first_greater_than(self.client_latency_start) + self.client_latency_end_key_selector = fdb.KeySelector.first_greater_or_equal(strinc(self.client_latency_start)) + self.version_stamp_start_idx = sample_key.index('S') + self.version_stamp_end_idx = sample_key.rindex('S') + self.tr_id_start_idx = sample_key.index('R') + self.tr_id_end_idx = sample_key.rindex('R') + self.chunk_num_start_idx = sample_key.index('N') + self.num_chunks_start_idx = sample_key.index('T') + + self.tr_info_map = {} + self.num_chunks_stored = 0 +
self.num_transactions_discarded = 0 + + def _check_and_adjust_chunk_cache_size(self): + if self.num_chunks_stored > self.max_num_chunks_to_store: + c_list = self.tr_info_map.pop(next(iter(self.tr_info_map))) + self.num_chunks_stored -= len(c_list) + self.num_transactions_discarded += 1 + + def parse_key(self, k): + version_stamp_bytes = k[self.version_stamp_start_idx:self.version_stamp_end_idx + 1] + tr_id = k[self.tr_id_start_idx:self.tr_id_end_idx + 1] + num_chunks = struct.unpack(">i", k[self.num_chunks_start_idx:self.num_chunks_start_idx + 4])[0] + chunk_num = struct.unpack(">i", k[self.chunk_num_start_idx:self.chunk_num_start_idx + 4])[0] + return version_stamp_bytes, tr_id, num_chunks, chunk_num + + def get_key_prefix_for_version_stamp(self, version_stamp): + return self.client_latency_start + struct.pack(">Q", version_stamp) + b'\x00\x00' + + @fdb.transactional + def find_version_for_timestamp(self, tr, timestamp, start): + """ + Uses Timekeeper to find the closest version to a timestamp. + If start is True, will find the greatest version at or before timestamp. + If start is False, will find the smallest version at or after the timestamp. + + :param tr: + :param timestamp: + :param start: + :return: + """ + tr.options.set_read_system_keys() + tr.options.set_read_lock_aware() + timekeeper_prefix = b'\xff\x02/timeKeeper/map/' + timestamp_packed = fdb.tuple.pack((timestamp,)) + if start: + start_key = timekeeper_prefix + end_key = fdb.KeySelector.first_greater_than(timekeeper_prefix + timestamp_packed) + reverse = True + else: + start_key = fdb.KeySelector.first_greater_or_equal(timekeeper_prefix + timestamp_packed) + end_key = fdb.KeySelector.first_greater_or_equal(strinc(timekeeper_prefix)) + reverse = False + for k, v in tr.snapshot.get_range(start_key, end_key, limit=1, reverse=reverse): + return fdb.tuple.unpack(v)[0] + return 0 if start else 0x8000000000000000 # we didn't find any timekeeper data so find the max range + + def fetch_transaction_info(self): + if self.min_timestamp: + start_version = self.find_version_for_timestamp(self.db, self.min_timestamp, True) + logger.debug("Using start version %s" % start_version) + start_key = self.get_key_prefix_for_version_stamp(start_version) + else: + start_key = self.client_latency_start_key_selector + + if self.max_timestamp: + end_version = self.find_version_for_timestamp(self.db, self.max_timestamp, False) + logger.debug("Using end version %s" % end_version) + end_key = self.get_key_prefix_for_version_stamp(end_version) + else: + end_key = self.client_latency_end_key_selector + + valid_transaction_infos = 0 + invalid_transaction_infos = 0 + + def build_client_transaction_info(v): + return ClientTransactionInfo(ByteBuffer(v), full_output=self.full_output, type_filter=self.type_filter) + + more = True + tr = self.db.create_transaction() + while more: + tr.options.set_read_system_keys() + tr.options.set_read_lock_aware() + found = 0 + buffer = [] + try: + logger.debug("Querying [%s:%s]" % (start_key, end_key)) + transaction_info_range = tr.snapshot.get_range(start_key, end_key, + streaming_mode=fdb.impl.StreamingMode.want_all) + for k, v in transaction_info_range: + found += 1 + #logger.debug(k) + start_key = fdb.KeySelector.first_greater_than(k) + + _, tr_id, num_chunks, chunk_num = self.parse_key(k) + + #logger.debug("num_chunks=%d, chunk_num=%d" % (num_chunks,chunk_num)) + + if num_chunks == 1: + assert chunk_num == 1 + try: + info = build_client_transaction_info(v) + if info.has_types(): + buffer.append(info) + 
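+                                # (single-chunk fast path: the value is a complete
+                                # serialized ClientTransactionInfo, so it is decoded
+                                # without ever touching the tr_info_map chunk cache)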
                                valid_transaction_infos += 1
+                        except UnsupportedProtocolVersionError:
+                            invalid_transaction_infos += 1
+                        except ValueError:
+                            invalid_transaction_infos += 1
+                    else:
+                        if chunk_num == 1:
+                            # first chunk
+                            assert tr_id not in self.tr_info_map
+                            self.tr_info_map[tr_id] = [TrInfoChunk(num_chunks, chunk_num, k, v)]
+                            self.num_chunks_stored += 1
+                            self._check_and_adjust_chunk_cache_size()
+                        else:
+                            if tr_id not in self.tr_info_map:
+                                logger.error("Got a middle chunk without having seen the beginning chunk. Discarding transaction id: %s\n" % tr_id)
+                                continue
+                            c_list = self.tr_info_map[tr_id]
+                            if c_list[-1].num_chunks != num_chunks or c_list[-1].chunk_num != chunk_num - 1:
+                                self.tr_info_map.pop(tr_id)
+                                self.num_chunks_stored -= len(c_list)
+                                raise Exception("Chunk numbers do not match for Transaction id: %s" % tr_id)
+                            c_list.append(TrInfoChunk(num_chunks, chunk_num, k, v))
+                            self.num_chunks_stored += 1
+                            if num_chunks == chunk_num:
+                                self.tr_info_map.pop(tr_id)
+                                self.num_chunks_stored -= len(c_list)
+                                try:
+                                    info = build_client_transaction_info(b''.join([chunk.value for chunk in c_list]))
+                                    if info.has_types():
+                                        buffer.append(info)
+                                        valid_transaction_infos += 1
+                                except UnsupportedProtocolVersionError:
+                                    invalid_transaction_infos += 1
+                                except ValueError:
+                                    invalid_transaction_infos += 1
+                            self._check_and_adjust_chunk_cache_size()
+                    if (valid_transaction_infos + invalid_transaction_infos) % 1000 == 0:
+                        print("Processed valid: %d, invalid: %d" % (valid_transaction_infos, invalid_transaction_infos))
+                if found == 0:
+                    more = False
+            except fdb.FDBError as e:
+                # error 1007 is transaction_too_old: reset and retry immediately instead of waiting
+                if e.code == 1007:
+                    tr.reset()
+                else:
+                    tr.on_error(e).wait()
+            for item in buffer:
+                yield item
+
+
+def has_sortedcontainers():
+    try:
+        import sortedcontainers
+        return True
+    except ImportError:
+        logger.warning("Can't find sortedcontainers so disabling RangeCounter")
+        return False
+
+
+def has_dateparser():
+    try:
+        import dateparser
+        return True
+    except ImportError:
+        logger.warning("Can't find dateparser so disabling human date parsing")
+        return False
+
+
+class RangeCounter(object):
+    def __init__(self, k):
+        self.k = k
+        from sortedcontainers import SortedDict
+        self.ranges = SortedDict()
+
+    def process(self, transaction_info):
+        for get_range in transaction_info.get_ranges:
+            self._insert_range(get_range.key_range.start_key, get_range.key_range.end_key)
+
+    def _insert_range(self, start_key, end_key):
+        keys = self.ranges.keys()
+        if len(keys) == 0:
+            self.ranges[start_key] = end_key, 1
+            return
+
+        start_pos = bisect_left(keys, start_key)
+        end_pos = bisect_left(keys, end_key)
+        #print("start_pos=%d, end_pos=%d" % (start_pos, end_pos))
+
+        possible_intersection_keys = keys[max(0, start_pos - 1):min(len(keys), end_pos + 1)]
+
+        start_range_left = start_key
+
+        for key in possible_intersection_keys:
+            cur_end_key, cur_count = self.ranges[key]
+            #logger.debug("key=%s, cur_end_key=%s, cur_count=%d, start_range_left=%s" % (key, cur_end_key, cur_count, start_range_left))
+            if start_range_left < key:
+                if end_key <= key:
+                    self.ranges[start_range_left] = end_key, 1
+                    return
+                self.ranges[start_range_left] = key, 1
+                start_range_left = key
+            assert start_range_left >= key
+            if start_range_left >= cur_end_key:
+                continue
+
+            # [key, start_range_left) = cur_count
+            # if key == start_range_left this will get overwritten below
+            self.ranges[key] = start_range_left, cur_count
+
+            if end_key <= cur_end_key:
+                # [start_range_left, end_key) = cur_count+1
+                # [end_key, cur_end_key) = cur_count
+
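+                # worked example (hypothetical counts): inserting [b, d) over an
+                # existing map {a: (e, 2)} first rewrites the overlapped entry to
+                # a: (b, 2), then reaches this branch with start_range_left == b and
+                # cur_end_key == e, storing b: (d, 3) and keeping d: (e, 2) below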
self.ranges[start_range_left] = end_key, cur_count + 1 + if end_key != cur_end_key: + self.ranges[end_key] = cur_end_key, cur_count + start_range_left = end_key + break + else: + # [start_range_left, cur_end_key) = cur_count+1 + self.ranges[start_range_left] = cur_end_key, cur_count+1 + start_range_left = cur_end_key + assert start_range_left <= end_key + + # there may be some range left + if start_range_left < end_key: + self.ranges[start_range_left] = end_key, 1 + + def get_count_for_key(self, key): + if key in self.ranges: + return self.ranges[key][1] + + keys = self.ranges.keys() + index = bisect_left(keys, key) + if index == 0: + return 0 + + index_key = keys[index-1] + if index_key <= key < self.ranges[index_key][0]: + return self.ranges[index_key][1] + return 0 + + def get_range_boundaries(self, shard_finder=None): + total = sum([count for _, (_, count) in self.ranges.items()]) + range_size = total // self.k + output_range_counts = [] + + def add_boundary(start, end, count): + if shard_finder: + shard_count = shard_finder.get_shard_count(start, end) + if shard_count == 1: + addresses = shard_finder.get_addresses_for_key(start) + else: + addresses = None + output_range_counts.append((start, end, count, shard_count, addresses)) + else: + output_range_counts.append((start, end, count, None, None)) + + this_range_start_key = None + count_this_range = 0 + for (start_key, (end_key, count)) in self.ranges.items(): + if not this_range_start_key: + this_range_start_key = start_key + count_this_range += count + if count_this_range >= range_size: + add_boundary(this_range_start_key, end_key, count_this_range) + count_this_range = 0 + this_range_start_key = None + if count_this_range > 0: + add_boundary(this_range_start_key, end_key, count_this_range) + + return output_range_counts + + +class ShardFinder(object): + def __init__(self, db): + self.db = db + + @staticmethod + @fdb.transactional + def _get_boundary_keys(tr, begin, end): + tr.options.set_read_lock_aware() + return fdb.locality.get_boundary_keys(tr, begin, end) + + @staticmethod + @fdb.transactional + def _get_addresses_for_key(tr, key): + tr.options.set_read_lock_aware() + return fdb.locality.get_addresses_for_key(tr, key) + + def get_shard_count(self, start_key, end_key): + return len(list(self._get_boundary_keys(self.db, start_key, end_key))) + 1 + + def get_addresses_for_key(self, key): + return [a.decode('ascii') for a in self._get_addresses_for_key(self.db, key).wait()] + + +class TopKeysCounter(object): + mutation_types_to_consider = frozenset([MutationType.SET_VALUE, MutationType.ADD_VALUE]) + + def __init__(self, k): + self.k = k + self.reads = defaultdict(lambda: 0) + self.writes = defaultdict(lambda: 0) + + def process(self, transaction_info): + for get in transaction_info.gets: + self.reads[get.key] += 1 + if transaction_info.commit: + for mutation in transaction_info.commit.mutations: + if mutation.code in self.mutation_types_to_consider: + self.writes[mutation.param_one] += 1 + + def _get_range_boundaries(self, counts, shard_finder=None): + total = sum([v for (k, v) in counts.items()]) + range_size = total // self.k + key_counts_sorted = sorted(counts.items()) + output_range_counts = [] + + def add_boundary(start, end, count): + if shard_finder: + shard_count = shard_finder.get_shard_count(start, end) + if shard_count == 1: + addresses = shard_finder.get_addresses_for_key(start) + else: + addresses = None + output_range_counts.append((start, end, count, shard_count, addresses)) + else: + 
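+                # without a ShardFinder the per-range request count is still
+                # reported; shard_count and addresses are left as None, and
+                # print_range_boundaries() in main() prints the short form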
output_range_counts.append((start, end, count, None, None)) + + start_key = None + count_this_range = 0 + for (k, v) in key_counts_sorted: + if not start_key: + start_key = k + count_this_range += v + if count_this_range >= range_size: + add_boundary(start_key, k, count_this_range) + count_this_range = 0 + start_key = None + if count_this_range > 0: + add_boundary(start_key, k, count_this_range) + + return output_range_counts + + def _get_top_k(self, counts): + count_key_pairs = sorted([(v, k) for (k, v) in counts.items()], reverse=True) + return count_key_pairs[0:self.k] + + def get_top_k_reads(self): + return self._get_top_k(self.reads) + + def get_top_k_writes(self): + return self._get_top_k(self.writes) + + def get_k_read_range_boundaries(self, shard_finder=None): + return self._get_range_boundaries(self.reads, shard_finder) + + def get_k_write_range_boundaries(self, shard_finder=None): + return self._get_range_boundaries(self.writes, shard_finder) + + +def connect(cluster_file=None): + db = fdb.open(cluster_file=cluster_file) + return db + + +def main(): + parser = argparse.ArgumentParser(description="TransactionProfilingAnalyzer") + parser.add_argument("-C", "--cluster-file", type=str, help="Cluster file") + parser.add_argument("--full-output", action="store_true", help="Print full output from mutations") + parser.add_argument("--filter-get-version", action="store_true", + help="Include get_version type. If no filter args are given all will be returned.") + parser.add_argument("--filter-get", action="store_true", + help="Include get type. If no filter args are given all will be returned.") + parser.add_argument("--filter-get-range", action="store_true", + help="Include get_range type. If no filter args are given all will be returned.") + parser.add_argument("--filter-commit", action="store_true", + help="Include commit type. If no filter args are given all will be returned.") + parser.add_argument("--filter-error-get", action="store_true", + help="Include error_get type. If no filter args are given all will be returned.") + parser.add_argument("--filter-error-get-range", action="store_true", + help="Include error_get_range type. If no filter args are given all will be returned.") + parser.add_argument("--filter-error-commit", action="store_true", + help="Include error_commit type. 
If no filter args are given all will be returned.")
+    start_time_group = parser.add_mutually_exclusive_group()
+    start_time_group.add_argument("--min-timestamp", type=int, help="Don't return events older than this epoch time")
+    start_time_group.add_argument("-s", "--start-time", type=str,
+                                  help="Don't return events older than this parsed time")
+    end_time_group = parser.add_mutually_exclusive_group()
+    end_time_group.add_argument("--max-timestamp", type=int, help="Don't return events newer than this epoch time")
+    end_time_group.add_argument("-e", "--end-time", type=str, help="Don't return events newer than this parsed time")
+    parser.add_argument("--top-keys", type=int, help="If specified, output this many top keys for reads or writes", default=0)
+    args = parser.parse_args()
+
+    type_filter = set()
+    if args.filter_get_version: type_filter.add("get_version")
+    if args.filter_get: type_filter.add("get")
+    if args.filter_get_range: type_filter.add("get_range")
+    if args.filter_commit: type_filter.add("commit")
+    if args.filter_error_get: type_filter.add("error_get")
+    if args.filter_error_get_range: type_filter.add("error_get_range")
+    if args.filter_error_commit: type_filter.add("error_commit")
+    top_keys = args.top_keys
+    key_counter = TopKeysCounter(top_keys) if top_keys else None
+    range_counter = RangeCounter(top_keys) if (has_sortedcontainers() and top_keys) else None
+    full_output = args.full_output or (top_keys > 0)
+
+    if args.min_timestamp:
+        min_timestamp = args.min_timestamp
+    elif args.start_time:
+        if not has_dateparser():
+            raise Exception("Can't find dateparser needed to parse human dates")
+        import dateparser
+        min_timestamp = int(dateparser.parse(args.start_time).timestamp())
+    else:
+        raise Exception("Must specify start time")
+
+    if args.max_timestamp:
+        max_timestamp = args.max_timestamp
+    elif args.end_time:
+        if not has_dateparser():
+            raise Exception("Can't find dateparser needed to parse human dates")
+        import dateparser
+        max_timestamp = int(dateparser.parse(args.end_time).timestamp())
+    else:
+        raise Exception("Must specify end time")
+
+    now = time.time()
+    if max_timestamp > now:
+        raise Exception("max_timestamp is %d seconds in the future" % (max_timestamp - now))
+    if min_timestamp > now:
+        raise Exception("min_timestamp is %d seconds in the future" % (min_timestamp - now))
+
+    logger.info("Loading transactions from %d to %d" % (min_timestamp, max_timestamp))
+
+    db = connect(cluster_file=args.cluster_file)
+    loader = TransactionInfoLoader(db, full_output=full_output, type_filter=type_filter,
+                                   min_timestamp=min_timestamp, max_timestamp=max_timestamp)
+    for info in loader.fetch_transaction_info():
+        if info.has_types():
+            if not key_counter and not range_counter:
+                print(info.to_json())
+            else:
+                if key_counter:
+                    key_counter.process(info)
+                if range_counter:
+                    range_counter.process(info)
+
+    if key_counter:
+        def print_top(top):
+            for (count, key) in top:
+                print("%s %d" % (key, count))
+
+        def print_range_boundaries(range_boundaries):
+            for (start, end, count, shard_count, addresses) in range_boundaries:
+                if not shard_count:
+                    print("[%s, %s] %d" % (start, end, count))
+                else:
+                    addresses_string = "addresses=%s" % ','.join(addresses) if addresses else ''
+                    print("[%s, %s] %d shards=%d %s" % (start, end, count, shard_count, addresses_string))
+
+        shard_finder = ShardFinder(db)
+        top_reads = key_counter.get_top_k_reads()
+        if top_reads:
+            print("Top %d reads:" % min(top_keys, len(top_reads)))
+            print_top(top_reads)
+            print("Approx equal sized 
gets range boundaries:") + print_range_boundaries(key_counter.get_k_read_range_boundaries(shard_finder=shard_finder)) + top_writes = key_counter.get_top_k_writes() + if top_writes: + print("Top %d writes:" % min(top_keys, len(top_writes))) + print_top(top_writes) + print("Approx equal sized commits range boundaries:") + print_range_boundaries(key_counter.get_k_write_range_boundaries(shard_finder=shard_finder)) + if range_counter: + range_boundaries = range_counter.get_range_boundaries(shard_finder=shard_finder) + if range_boundaries: + print("Approx equal sized get_ranges boundaries:") + print_range_boundaries(range_boundaries) + + +if __name__ == "__main__": + main() + diff --git a/documentation/CMakeLists.txt b/documentation/CMakeLists.txt index ba4b299433..83fabf20ba 100644 --- a/documentation/CMakeLists.txt +++ b/documentation/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(tutorial) # build a virtualenv set(sphinx_dir ${CMAKE_CURRENT_SOURCE_DIR}/sphinx) set(venv_dir ${CMAKE_CURRENT_BINARY_DIR}/venv) diff --git a/documentation/sphinx/source/downloads.rst b/documentation/sphinx/source/downloads.rst index 82aefde475..4f300b9aee 100644 --- a/documentation/sphinx/source/downloads.rst +++ b/documentation/sphinx/source/downloads.rst @@ -10,38 +10,38 @@ macOS The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server. -* `FoundationDB-6.2.6.pkg `_ +* `FoundationDB-6.2.8.pkg `_ Ubuntu ------ The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x. -* `foundationdb-clients-6.2.6-1_amd64.deb `_ -* `foundationdb-server-6.2.6-1_amd64.deb `_ (depends on the clients package) +* `foundationdb-clients-6.2.8-1_amd64.deb `_ +* `foundationdb-server-6.2.8-1_amd64.deb `_ (depends on the clients package) RHEL/CentOS EL6 --------------- The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x. -* `foundationdb-clients-6.2.6-1.el6.x86_64.rpm `_ -* `foundationdb-server-6.2.6-1.el6.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.8-1.el6.x86_64.rpm `_ +* `foundationdb-server-6.2.8-1.el6.x86_64.rpm `_ (depends on the clients package) RHEL/CentOS EL7 --------------- The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x. -* `foundationdb-clients-6.2.6-1.el7.x86_64.rpm `_ -* `foundationdb-server-6.2.6-1.el7.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.8-1.el7.x86_64.rpm `_ +* `foundationdb-server-6.2.8-1.el7.x86_64.rpm `_ (depends on the clients package) Windows ------- The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server. 
-* `foundationdb-6.2.6-x64.msi `_ +* `foundationdb-6.2.8-x64.msi `_ API Language Bindings ===================== @@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package: -* `foundationdb-6.2.6.tar.gz `_ +* `foundationdb-6.2.8.tar.gz `_ Ruby 1.9.3/2.0.0+ ----------------- -* `fdb-6.2.6.gem `_ +* `fdb-6.2.8.gem `_ Java 8+ ------- -* `fdb-java-6.2.6.jar `_ -* `fdb-java-6.2.6-javadoc.jar `_ +* `fdb-java-6.2.8.jar `_ +* `fdb-java-6.2.8-javadoc.jar `_ Go 1.11+ -------- diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 911e7d8baf..e8d4cb1b7f 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -29,7 +29,8 @@ "resolution", "proxy", "master", - "test" + "test", + "storage_cache" ] }, "degraded":true, @@ -66,6 +67,7 @@ "cluster_controller", "data_distributor", "ratekeeper", + "storage_cache", "router", "coordinator" ] diff --git a/documentation/sphinx/source/old-release-notes/release-notes-620.rst b/documentation/sphinx/source/old-release-notes/release-notes-620.rst index 8227a415a0..a15a73622f 100644 --- a/documentation/sphinx/source/old-release-notes/release-notes-620.rst +++ b/documentation/sphinx/source/old-release-notes/release-notes-620.rst @@ -2,7 +2,16 @@ Release Notes ############# -6.2.6 +6.2.8 +===== + +Fixes +----- + +* Significantly improved the rate at which the transaction logs in a remote region can pull data from the primary region. `(PR #2307) `_ `(PR #2323) `_. +* The ``system_kv_size_bytes`` status field could report a size much larger than the actual size of the system keyspace. `(PR #2305) `_. + +6.2.7 ===== Performance @@ -39,7 +48,6 @@ Fixes * File descriptors opened by clients and servers set close-on-exec, if available on the platform. `(PR #1581) `_. * ``fdbrestore`` commands other than ``start`` required a default cluster file to be found but did not actually use it. `(PR #1912) `_. * Unneeded network connections were not being closed because peer reference counts were handled improperly. `(PR #1768) `_. -* Under certain conditions, cross region replication could stall for 10 minute periods. `(PR #1818) `_. * In very rare scenarios, master recovery would restart because system metadata was loaded incorrectly. `(PR #1919) `_. * Ratekeeper will aggressively throttle when unable to fetch the list of storage servers for a considerable period of time. `(PR #1858) `_. * Proxies could become overloaded when all storage servers on a team fail. [6.2.1] `(PR #1976) `_. @@ -58,6 +66,10 @@ Fixes * Committing transactions larger than 1 MB could cause the proxy to stall for up to a second. [6.2.6] `(PR #2250) `_. * The cluster controller could become saturated in clusters with large numbers of connected clients using TLS. [6.2.6] `(PR #2252) `_. * Backup and DR would not share a mutation stream if they were started on different versions of FoundationDB. Either backup or DR must be restarted to resolve this issue. [6.2.6] `(PR #2202) `_. +* Don't track batch priority GRV requests in latency bands. [6.2.7] `(PR #2279) `_. +* Transaction log processes used twice their normal memory when switching spill types. [6.2.7] `(PR #2256) `_. +* Under certain conditions, cross region replication could stall for 10 minute periods. [6.2.7] `(PR #1818) `_ `(PR #2276) `_. 
+* When dropping a remote region from the configuration after processes in the region have failed, data distribution would create teams from the dead servers for one minute. [6.2.7] `(PR #2286) `_. Status ------ @@ -130,10 +142,10 @@ Fixes only impacting 6.2.0+ * The cluster controller would saturate its CPU for a few seconds when sending configuration information to all of the worker processes. [6.2.4] `(PR #2086) `_. * The data distributor would build all possible team combinations if it was tracking an unhealthy server with less than 10 teams. [6.2.4] `(PR #2099) `_. * The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) `_. -* The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) `_. * A storage server could crash if it took longer than 10 minutes to fetch a key range from another server. [6.2.5] `(PR #2170) `_. * Excluding or including servers would restart the data distributor. [6.2.5] `(PR #2170) `_. * The data distributor could read invalid memory when estimating database size. [6.2.6] `(PR #2225) `_. +* Status could incorrectly report that backup and DR were not sharing a mutation stream. [6.2.7] `(PR #2274) `_. Earlier release notes --------------------- diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index e2858be844..b2d130bc67 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -13,13 +13,14 @@ Fixes Status ------ -* Replaced ``cluster.database_locked`` status field with ``cluster.database_lock_state``, which contains two subfields: ``locked`` (boolean) and ``lock_uid`` (which contains the database lock uid if the database is locked). `(PR #2058) `_. +* Replaced ``cluster.database_locked`` status field with ``cluster.database_lock_state``, which contains two subfields: ``locked`` (boolean) and ``lock_uid`` (which contains the database lock uid if the database is locked). `(PR #2058) `_ Bindings -------- Other Changes ------------- +* Double the number of shard locations that the client will cache locally. `(PR #2198) `_ Earlier release notes --------------------- diff --git a/documentation/tutorial/CMakeLists.txt b/documentation/tutorial/CMakeLists.txt new file mode 100644 index 0000000000..5c5e181625 --- /dev/null +++ b/documentation/tutorial/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TUTORIAL_SRCS tutorial.actor.cpp) + +add_flow_target(EXECUTABLE NAME tutorial SRCS "${TUTORIAL_SRCS}") +target_link_libraries(tutorial PUBLIC fdbclient) diff --git a/documentation/tutorial/tutorial.actor.cpp b/documentation/tutorial/tutorial.actor.cpp new file mode 100644 index 0000000000..d0be6a3e2b --- /dev/null +++ b/documentation/tutorial/tutorial.actor.cpp @@ -0,0 +1,467 @@ +/* + * tutorial.actor.cpp + + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/flow.h" +#include "flow/Platform.h" +#include "flow/DeterministicRandom.h" +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/ReadYourWrites.h" +#include +#include +#include +#include +#include "flow/actorcompiler.h" + +NetworkAddress serverAddress; + +// this is a simple actor that will report how long +// it is already running once a second. +ACTOR Future simpleTimer() { + // we need to remember the time when we first + // started. + // This needs to be a state-variable because + // we will use it in different parts of the + // actor. If you don't understand how state + // variables work, it is a good idea to remove + // the state keyword here and look at the + // generated C++ code from the actor compiler. + state double start_time = g_network->now(); + loop { + wait(delay(1.0)); + std::cout << format("Time: %.2f\n", g_network->now() - start_time); + } +} + +// A actor that demonstrates how choose-when +// blocks work. +ACTOR Future someFuture(Future ready) { + // loop choose {} works as well here - the braces are optional + loop choose { + when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; } + when(int r = wait(ready)) { + std::cout << format("Ready %d\n", r); + wait(delay(double(r))); + std::cout << "Done\n"; + return Void(); + } + } +} + +ACTOR Future promiseDemo() { + state Promise promise; + state Future f = someFuture(promise.getFuture()); + wait(delay(3.0)); + promise.send(2); + wait(f); + return Void(); +} + +ACTOR Future eventLoop(AsyncTrigger* trigger) { + loop choose { + when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; } + when(wait(trigger->onTrigger())) { std::cout << "Triggered!\n"; } + } +} + +ACTOR Future triggerDemo() { + state int runs = 1; + state AsyncTrigger trigger; + state Future triggerLoop = eventLoop(&trigger); + while (++runs < 10) { + wait(delay(1.0)); + std::cout << "trigger.."; + trigger.trigger(); + } + std::cout << "Done."; + return Void(); +} + +struct EchoServerInterface { + constexpr static FileIdentifier file_identifier = 3152015; + RequestStream getInterface; + RequestStream echo; + RequestStream reverse; + + template + void serialize(Ar& ar) { + serializer(ar, echo, reverse); + } +}; + +struct GetInterfaceRequest { + constexpr static FileIdentifier file_identifier = 12004156; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + +struct EchoRequest { + constexpr static FileIdentifier file_identifier = 10624019; + std::string message; + // this variable has to be called reply! + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, message, reply); + } +}; + +struct ReverseRequest { + constexpr static FileIdentifier file_identifier = 10765955; + std::string message; + // this variable has to be called reply! 
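+	// (the flow RPC helpers are written against a member with exactly this
+	// name; a sketch of the client-side call this enables:
+	//     ReverseRequest req;
+	//     req.message = "abc";
+	//     std::string reversed = wait(server.reverse.getReply(req));
+	// the server side fulfills it with req.reply.send(...) as in echoServer)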
+ ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, message, reply); + } +}; + +uint64_t tokenCounter = 1; + +ACTOR Future echoServer() { + state EchoServerInterface echoServer; + echoServer.getInterface.makeWellKnownEndpoint(UID(-1, ++tokenCounter), TaskPriority::DefaultEndpoint); + loop { + choose { + when(GetInterfaceRequest req = waitNext(echoServer.getInterface.getFuture())) { + req.reply.send(echoServer); + } + when(EchoRequest req = waitNext(echoServer.echo.getFuture())) { req.reply.send(req.message); } + when(ReverseRequest req = waitNext(echoServer.reverse.getFuture())) { + req.reply.send(std::string(req.message.rbegin(), req.message.rend())); + } + } + } +} + +ACTOR Future echoClient() { + state EchoServerInterface server; + server.getInterface = RequestStream(Endpoint({ serverAddress }, UID(-1, ++tokenCounter))); + EchoServerInterface s = wait(server.getInterface.getReply(GetInterfaceRequest())); + server = s; + EchoRequest echoRequest; + echoRequest.message = "Hello World"; + std::string echoMessage = wait(server.echo.getReply(echoRequest)); + std::cout << format("Sent {} to echo, received %s\n", "Hello World", echoMessage.c_str()); + ReverseRequest reverseRequest; + reverseRequest.message = "Hello World"; + std::string reverseString = wait(server.reverse.getReply(reverseRequest)); + std::cout << format("Sent {} to reverse, received {}\n", "Hello World", reverseString.c_str()); + return Void(); +} + +struct SimpleKeyValueStoreInteface { + constexpr static FileIdentifier file_identifier = 8226647; + RequestStream connect; + RequestStream get; + RequestStream set; + RequestStream clear; + + template + void serialize(Ar& ar) { + serializer(ar, connect, get, set, clear); + } +}; + +struct GetKVInterface { + constexpr static FileIdentifier file_identifier = 8062308; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + +struct GetRequest { + constexpr static FileIdentifier file_identifier = 6983506; + std::string key; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, key, reply); + } +}; + +struct SetRequest { + constexpr static FileIdentifier file_identifier = 7554186; + std::string key; + std::string value; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, key, value, reply); + } +}; + +struct ClearRequest { + constexpr static FileIdentifier file_identifier = 8500026; + std::string from; + std::string to; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, from, to, reply); + } +}; + +ACTOR Future kvStoreServer() { + state SimpleKeyValueStoreInteface inf; + state std::map store; + inf.connect.makeWellKnownEndpoint(UID(-1, ++tokenCounter), TaskPriority::DefaultEndpoint); + loop { + choose { + when(GetKVInterface req = waitNext(inf.connect.getFuture())) { + std::cout << "Received connection attempt\n"; + req.reply.send(inf); + } + when(GetRequest req = waitNext(inf.get.getFuture())) { + auto iter = store.find(req.key); + if (iter == store.end()) { + req.reply.sendError(io_error()); + } else { + req.reply.send(iter->second); + } + } + when(SetRequest req = waitNext(inf.set.getFuture())) { + store[req.key] = req.value; + req.reply.send(Void()); + } + when(ClearRequest req = waitNext(inf.clear.getFuture())) { + auto from = store.lower_bound(req.from); + auto to = store.lower_bound(req.to); + while (from != store.end() && from != to) { + auto next = from; + ++next; + store.erase(from); + from = next; + } + req.reply.send(Void()); + } 
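+			// (note on the ClearRequest handler above: it erases the half-open
+			// interval [req.from, req.to) from the std::map, advancing a copy of
+			// the iterator before each erase so iteration stays valid)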
+ } + } +} + +ACTOR Future connect() { + std::cout << format("%ull: Connect...\n", uint64_t(g_network->now())); + SimpleKeyValueStoreInteface c; + c.connect = RequestStream(Endpoint({ serverAddress }, UID(-1, ++tokenCounter))); + SimpleKeyValueStoreInteface result = wait(c.connect.getReply(GetKVInterface())); + std::cout << format("%ull: done..\n", uint64_t(g_network->now())); + return result; +} + +ACTOR Future kvSimpleClient() { + state SimpleKeyValueStoreInteface server = wait(connect()); + std::cout << format("Set %s -> %s\n", "foo", "bar"); + SetRequest setRequest; + setRequest.key = "foo"; + setRequest.value = "bar"; + wait(server.set.getReply(setRequest)); + GetRequest getRequest; + getRequest.key = "foo"; + std::string value = wait(server.get.getReply(getRequest)); + std::cout << format("get(%s) -> %s\n", "foo", value.c_str()); + return Void(); +} + +ACTOR Future kvClient(SimpleKeyValueStoreInteface server, std::shared_ptr ops) { + state Future timeout = delay(20); + state int rangeSize = 2 << 12; + loop { + SetRequest setRequest; + setRequest.key = std::to_string(deterministicRandom()->randomInt(0, rangeSize)); + setRequest.value = "foo"; + wait(server.set.getReply(setRequest)); + ++(*ops); + try { + GetRequest getRequest; + getRequest.key = std::to_string(deterministicRandom()->randomInt(0, rangeSize)); + std::string _ = wait(server.get.getReply(getRequest)); + ++(*ops); + } catch (Error& e) { + if (e.code() != error_code_io_error) { + throw e; + } + } + int from = deterministicRandom()->randomInt(0, rangeSize); + ClearRequest clearRequest; + clearRequest.from = std::to_string(from); + clearRequest.to = std::to_string(from + 100); + wait(server.clear.getReply(clearRequest)); + ++(*ops); + if (timeout.isReady()) { + // we are done + return Void(); + } + } +} + +ACTOR Future throughputMeasurement(std::shared_ptr operations) { + loop { + wait(delay(1.0)); + std::cout << format("%ull op/s\n", *operations); + *operations = 0; + } +} + +ACTOR Future multipleClients() { + SimpleKeyValueStoreInteface server = wait(connect()); + auto ops = std::make_shared(0); + std::vector> clients(100); + for (auto& f : clients) { + f = kvClient(server, ops); + } + auto done = waitForAll(clients); + wait(done || throughputMeasurement(ops)); + return Void(); +} + +std::string clusterFile = "fdb.cluster"; + +ACTOR Future fdbClient() { + wait(delay(30)); + state Database db = Database::createDatabase(clusterFile, 300); + state Transaction tx(db); + state std::string keyPrefix = "/tut/"; + state Key startKey; + state KeyRef endKey = LiteralStringRef("/tut0"); + state int beginIdx = 0; + loop { + try { + tx.reset(); + // this workload is stupidly simple: + // 1. select a random key between 1 + // and 1e8 + // 2. select this key plus the 100 + // next ones + // 3. 
write 10 values in [k, k+100] + beginIdx = deterministicRandom()->randomInt(0, 1e8 - 100); + startKey = keyPrefix + std::to_string(beginIdx); + Standalone range = wait(tx.getRange(KeyRangeRef(startKey, endKey), 100)); + for (int i = 0; i < 10; ++i) { + Key k = Key(keyPrefix + std::to_string(beginIdx + deterministicRandom()->randomInt(0, 100))); + tx.set(k, LiteralStringRef("foo")); + } + wait(tx.commit()); + std::cout << "Committed\n"; + wait(delay(2.0)); + } catch (Error& e) { + wait(tx.onError(e)); + } + } +} + +ACTOR Future fdbStatusStresser() { + state Database db = Database::createDatabase(clusterFile, 300); + state ReadYourWritesTransaction tx(db); + state Key statusJson(std::string("\xff\xff/status/json")); + loop { + try { + tx.reset(); + Optional _ = wait(tx.get(statusJson)); + } catch (Error& e) { + wait(tx.onError(e)); + } + } +} + +std::unordered_map()>> actors = { { "timer", &simpleTimer }, + { "promiseDemo", &promiseDemo }, + { "triggerDemo", &triggerDemo }, + { "echoServer", &echoServer }, + { "echoClient", &echoClient }, + { "kvStoreServer", &kvStoreServer }, + { "kvSimpleClient", &kvSimpleClient }, + { "multipleClients", &multipleClients }, + { "fdbClient", &fdbClient }, + { "fdbStatusStresser", &fdbStatusStresser } }; + +int main(int argc, char* argv[]) { + bool isServer = false; + std::string port; + std::vector()>> toRun; + // parse arguments + for (int i = 1; i < argc; ++i) { + std::string arg(argv[i]); + if (arg == "-p") { + isServer = true; + if (i + 1 >= argc) { + std::cout << "Expecting an argument after -p\n"; + return 1; + } + port = std::string(argv[++i]); + continue; + } else if (arg == "-s") { + if (i + 1 >= argc) { + std::cout << "Expecting an argument after -s\n"; + return 1; + } + serverAddress = NetworkAddress::parse(argv[++i]); + continue; + } else if (arg == "-C") { + clusterFile = argv[++i]; + std::cout << "Using cluster file " << clusterFile << std::endl; + continue; + } + auto actor = actors.find(arg); + if (actor == actors.end()) { + std::cout << format("Error: actor %s does not exist\n", arg.c_str()); + return 1; + } + toRun.push_back(actor->second); + } + platformInit(); + g_network = newNet2(false, true); + NetworkAddress publicAddress = NetworkAddress::parse("0.0.0.0:0"); + if (isServer) { + publicAddress = NetworkAddress::parse("0.0.0.0:" + port); + } + // openTraceFile(publicAddress, TRACE_DEFAULT_ROLL_SIZE, + // TRACE_DEFAULT_MAX_LOGS_SIZE); + try { + if (isServer) { + auto listenError = FlowTransport::transport().bind(publicAddress, publicAddress); + if (listenError.isError()) { + listenError.get(); + } + } + } catch (Error& e) { + std::cout << format("Error while binding to address (%d): %s\n", e.code(), e.what()); + } + // now we start the actors + std::vector> all; + for (auto& f : toRun) { + all.emplace_back(f()); + } + auto f = stopAfter(waitForAll(all)); + g_network->run(); + return 0; +} diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 1a78d1f807..38e7d0fb73 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -826,7 +826,7 @@ const KeyRef exeFastRestoreAgent = LiteralStringRef("fastrestore_agent"); // mus const KeyRef exeDatabaseAgent = LiteralStringRef("dr_agent"); const KeyRef exeDatabaseBackup = LiteralStringRef("fdbdr"); -extern const char* getHGVersion(); +extern const char* getSourceVersion(); #ifdef _WIN32 void parentWatcher(void *parentHandle) { @@ -842,7 +842,7 @@ void parentWatcher(void *parentHandle) { static void printVersion() { printf("FoundationDB " FDB_VT_PACKAGE_NAME " 
(v" FDB_VT_VERSION ")\n"); - printf("source version %s\n", getHGVersion()); + printf("source version %s\n", getSourceVersion()); printf("protocol %llx\n", (long long) currentProtocolVersion.version()); } @@ -913,7 +913,7 @@ void printBackupContainerInfo() { static void printBackupUsage(bool devhelp) { printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); - printf("Usage: %s (start | status | abort | wait | discontinue | pause | resume | expire | delete | describe | list) [OPTIONS]\n\n", exeBackup.toString().c_str()); + printf("Usage: %s (start | status | abort | wait | discontinue | pause | resume | expire | delete | describe | list | cleanup) [OPTIONS]\n\n", exeBackup.toString().c_str()); printf(" -C CONNFILE The path of a file containing the connection string for the\n" " FoundationDB cluster. The default is first the value of the\n" " FDB_CLUSTER_FILE environment variable, then `./fdb.cluster',\n" @@ -964,6 +964,11 @@ static void printBackupUsage(bool devhelp) { printf(" --trace_format FORMAT\n" " Select the format of the trace files. xml (the default) and json are supported.\n" " Has no effect unless --log is specified.\n"); + printf(" --max_cleanup_seconds SECONDS\n" + " Specifies the amount of time a backup or DR needs to be stale before cleanup will\n" + " remove mutations for it. By default this is set to one hour.\n"); + printf(" --delete_data\n" + " This flag will cause cleanup to remove mutations for the most stale backup or DR.\n"); #ifndef TLS_DISABLED printf(TLS_HELP); #endif @@ -3454,7 +3459,7 @@ int main(int argc, char* argv[]) { TraceEvent("ProgramStart") .setMaxEventLength(12000) - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION ) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL)) @@ -3948,7 +3953,7 @@ ACTOR static Future _fastRestore(Database cx, Key tagName, Key url, boo ACTOR Future fastRestore(Database cx, Standalone tagName, Standalone url, bool waitForComplete, long targetVersion, bool verbose, Standalone range, Standalone addPrefix, Standalone removePrefix) { - Version targetVersion = + Version result = wait(_fastRestore(cx, tagName, url, waitForComplete, targetVersion, verbose, range, addPrefix, removePrefix)); - return targetVersion; -} \ No newline at end of file + return result; +} diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 334cf16005..3807ab7cf9 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -54,7 +54,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. -extern const char* getHGVersion(); +extern const char* getSourceVersion(); std::vector validOptions; @@ -563,7 +563,7 @@ void initHelp() { void printVersion() { printf("FoundationDB CLI " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); - printf("source version %s\n", getHGVersion()); + printf("source version %s\n", getSourceVersion()); printf("protocol %" PRIx64 "\n", currentProtocolVersion.version()); } @@ -2632,7 +2632,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (opt.trace) { TraceEvent("CLIProgramStart") .setMaxEventLength(12000) - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 
0 : time(NULL)) @@ -3511,7 +3511,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { printf("Data distribution is turned off.\n"); } else if (tokencmp(tokens[1], "disable")) { if (tokencmp(tokens[2], "ssfailure")) { - bool _ = wait(makeInterruptable(setHealthyZone(db, ignoreSSFailuresZoneString, 0))); + wait(success(makeInterruptable(setHealthyZone(db, ignoreSSFailuresZoneString, 0)))); printf("Data distribution is disabled for storage server failures.\n"); } else if (tokencmp(tokens[2], "rebalance")) { wait(makeInterruptable(setDDIgnoreRebalanceSwitch(db, true))); @@ -3523,7 +3523,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } } else if (tokencmp(tokens[1], "enable")) { if (tokencmp(tokens[2], "ssfailure")) { - bool _ = wait(makeInterruptable(clearHealthyZone(db, false, true))); + wait(success(makeInterruptable(clearHealthyZone(db, false, true)))); printf("Data distribution is enabled for storage server failures.\n"); } else if (tokencmp(tokens[2], "rebalance")) { wait(makeInterruptable(setDDIgnoreRebalanceSwitch(db, false))); diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index 5627a1a349..6a02bac4b3 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -862,29 +862,33 @@ ACTOR Future cleanupLogMutations(Database cx, Value destUidValue, bool del wait(success(foundDRKey) && success(foundBackupKey)); if(foundDRKey.get().present() && foundBackupKey.get().present()) { - printf("WARNING: Found a tag which looks like both a backup and a DR. This tag was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("WARNING: Found a tag that looks like both a backup and a DR. This tag is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } else if(foundDRKey.get().present() && !foundBackupKey.get().present()) { - printf("Found a DR which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("Found a DR that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } else if(!foundDRKey.get().present() && foundBackupKey.get().present()) { - printf("Found a Backup which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("Found a Backup that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } else { - printf("WARNING: Found a unknown tag which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("WARNING: Found an unknown tag that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } loggedLogUids.insert(currLogUid); } } - if( readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND && deleteData && (!removingLogUid.present() || minVersionLogUid == removingLogUid.get()) ) { - removingLogUid = minVersionLogUid; - wait(eraseLogData(tr, minVersionLogUid, destUidValue)); - wait(tr->commit()); - printf("\nSuccessfully removed the tag which was %.4f hours behind.\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); - } else if(removingLogUid.present() && minVersionLogUid != removingLogUid.get()) { - printf("\nWARNING: The oldest tag was possibly removed, run again without `--delete_data' to check.\n"); - } else if( deleteData ) { - printf("\nWARNING: Did not 
delete data because the tag was not at least %.4f hours behind. Change `--min_cleanup_seconds' to adjust this threshold.\n", CLIENT_KNOBS->MIN_CLEANUP_SECONDS/3600.0); + if(deleteData) { + if(readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND && (!removingLogUid.present() || minVersionLogUid == removingLogUid.get())) { + removingLogUid = minVersionLogUid; + wait(eraseLogData(tr, minVersionLogUid, destUidValue)); + wait(tr->commit()); + printf("\nSuccessfully removed the tag that was %.4f hours behind.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + } else if(removingLogUid.present() && minVersionLogUid != removingLogUid.get()) { + printf("\nWARNING: The oldest tag was possibly removed, run again without `--delete_data' to check.\n\n"); + } else { + printf("\nWARNING: Did not delete data because the tag is not at least %.4f hours behind. Change `--min_cleanup_seconds' to adjust this threshold.\n\n", CLIENT_KNOBS->MIN_CLEANUP_SECONDS/3600.0); + } + } else if(readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND) { + printf("\nPassing `--delete_data' would delete the tag that is %.4f hours behind.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } else { - printf("\nPassing `--delete_data' would delete the tag which was %.4f hours behind.\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("\nPassing `--delete_data' would not delete the tag that is %.4f hours behind. Change `--min_cleanup_seconds' to adjust the cleanup threshold.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } return Void(); diff --git a/fdbclient/BackupContainer.h b/fdbclient/BackupContainer.h index 5671788c9a..b14ce7e37c 100644 --- a/fdbclient/BackupContainer.h +++ b/fdbclient/BackupContainer.h @@ -173,7 +173,7 @@ struct RestorableFileSet { Version targetVersion; std::vector logs; std::vector ranges; - KeyspaceSnapshotFile snapshot; + KeyspaceSnapshotFile snapshot; // Info. for debug purposes }; /* IBackupContainer is an interface to a set of backup data, which contains diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index d47bdb8334..da58789a11 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -48,6 +48,7 @@ set(FDBCLIENT_SRCS Notified.h ReadYourWrites.actor.cpp ReadYourWrites.h + RestoreWorkerInterface.actor.h RunTransaction.actor.h RYWIterator.cpp RYWIterator.h diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 76c74c41b9..6f06e19432 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -93,7 +93,7 @@ struct struct_like_traits : std::true_type { } template - static const void assign(Member& m, const Type& t, Context&) { + static void assign(Member& m, const Type& t, Context&) { if constexpr (i == 0) { m.id = t; } else { @@ -105,15 +105,16 @@ struct struct_like_traits : std::true_type { static const Tag invalidTag {tagLocalitySpecial, 0}; static const Tag txsTag {tagLocalitySpecial, 1}; +static const Tag cacheTag {tagLocalitySpecial, 2}; enum { txsTagOld = -1, invalidTagOld = -100 }; struct TagsAndMessage { StringRef message; - std::vector tags; + VectorRef tags; TagsAndMessage() {} - TagsAndMessage(StringRef message, const std::vector& tags) : message(message), tags(tags) {} + TagsAndMessage(StringRef message, VectorRef tags) : message(message), tags(tags) {} // Loads tags and message from a serialized buffer. 
"rd" is checkpointed at // its begining position to allow the caller to rewind if needed. @@ -123,15 +124,11 @@ struct TagsAndMessage { int32_t messageLength; uint16_t tagCount; uint32_t sub; - tags.clear(); rd->checkpoint(); *rd >> messageLength >> sub >> tagCount; if (messageVersionSub) *messageVersionSub = sub; - tags.resize(tagCount); - for (int i = 0; i < tagCount; i++) { - *rd >> tags[i]; - } + tags = VectorRef((Tag*)rd->readBytes(tagCount*sizeof(Tag)), tagCount); const int32_t rawLength = messageLength + sizeof(messageLength); rd->rewind(); rd->checkpoint(); @@ -553,6 +550,10 @@ inline KeySelectorRef operator + (const KeySelectorRef& s, int off) { inline KeySelectorRef operator - (const KeySelectorRef& s, int off) { return KeySelectorRef(s.getKey(), s.orEqual, s.offset-off); } +inline bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) { + // Returns true if the given range suffices to at least begin to resolve the given KeySelectorRef + return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end); +} template struct KeyRangeWith : KeyRange { diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 84efc5013b..4dd057a48e 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -572,8 +572,8 @@ namespace fileBackup { // Functions for consuming big endian (network byte order) integers. // Consumes a big endian number, swaps it to little endian, and returns it. - const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());} - const uint32_t consumeNetworkUInt32() { return bigEndian32( consume());} + int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());} + uint32_t consumeNetworkUInt32() { return bigEndian32( consume());} bool eof() { return rptr == end; } diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index e8f3e2d19e..4eb4229ca2 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -69,7 +69,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; init( BROADCAST_BATCH_SIZE, 20 ); if( randomize && BUGGIFY ) BROADCAST_BATCH_SIZE = 1; - init( LOCATION_CACHE_EVICTION_SIZE, 300000 ); + init( LOCATION_CACHE_EVICTION_SIZE, 600000 ); init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3; init( GET_RANGE_SHARD_LIMIT, 2 ); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 7740b562b6..34bbc60ed3 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -60,7 +60,7 @@ #endif #include "flow/actorcompiler.h" // This must be the last #include. 
-extern const char* getHGVersion(); +extern const char* getSourceVersion(); using std::max; using std::min; @@ -791,7 +791,7 @@ Database Database::createDatabase( Reference connFile, in openTraceFile(NetworkAddress(publicIP, ::getpid()), networkOptions.traceRollSize, networkOptions.traceMaxLogsSize, networkOptions.traceDirectory.get(), "trace", networkOptions.traceLogGroup); TraceEvent("ClientStart") - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detail("ClusterFile", connFile->getFilename().c_str()) diff --git a/fdbclient/Notified.h b/fdbclient/Notified.h index 80a87192f0..d0cd4ec846 100644 --- a/fdbclient/Notified.h +++ b/fdbclient/Notified.h @@ -25,103 +25,70 @@ #include "fdbclient/FDBTypes.h" #include "flow/TDMetric.actor.h" -struct NotifiedVersion { - NotifiedVersion( StringRef& name, StringRef const &id, Version version = 0 ) : val(name, id, version) { val = version; } - NotifiedVersion( Version version = 0 ) : val(StringRef(), StringRef(), version) {} +template +struct IsMetricHandle : std::false_type {}; +template +struct IsMetricHandle> : std::true_type {}; - void initMetric(const StringRef& name, const StringRef &id) { - Version version = val; - val.init(name, id); - val = version; - } +template +struct Notified { + explicit Notified(ValueType v = 0) { val = v; } - Future whenAtLeast( Version limit ) { - if (val >= limit) - return Void(); + [[nodiscard]] Future whenAtLeast(const ValueType& limit) { + if (val >= limit) return Void(); Promise p; - waiting.push( std::make_pair(limit,p) ); + waiting.push(std::make_pair(limit, p)); return p.getFuture(); } - Version get() const { return val; } + [[nodiscard]] ValueType get() const { return val; } - void set( Version v ) { - ASSERT( v >= val ); + void initMetric(const StringRef& name, const StringRef& id) { + if constexpr (IsMetricHandle::value) { + ValueType v = val; + val.init(name, id); + val = v; + } else { + TraceEvent(SevError, "InvalidNotifiedOperation") + .detail("Reason", "Notified where T is not a metric: Can't use initMetric"); + } + } + + void set(const ValueType& v) { + ASSERT(v >= val); if (v != val) { val = v; std::vector> toSend; - while ( waiting.size() && v >= waiting.top().first ) { + while (waiting.size() && v >= waiting.top().first) { Promise p = std::move(waiting.top().second); waiting.pop(); toSend.push_back(p); } - for(auto& p : toSend) { + for (auto& p : toSend) { p.send(Void()); } } } - void operator=( Version v ) { - set( v ); + void operator=(const ValueType& v) { set(v); } + + Notified(Notified&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(std::move(r.val)) {} + void operator=(Notified&& r) BOOST_NOEXCEPT { + waiting = std::move(r.waiting); + val = std::move(r.val); } - NotifiedVersion(NotifiedVersion&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(std::move(r.val)) {} - void operator=(NotifiedVersion&& r) BOOST_NOEXCEPT { waiting = std::move(r.waiting); val = std::move(r.val); } - private: - typedef std::pair> Item; + using Item = std::pair>; struct ItemCompare { bool operator()(const Item& a, const Item& b) { return a.first > b.first; } }; std::priority_queue, ItemCompare> waiting; - VersionMetricHandle val; + T val; }; -struct NotifiedDouble { - explicit NotifiedDouble( double val = 0 ) : val(val) {} - - Future whenAtLeast( double limit ) { - if (val >= limit) - return Void(); - Promise p; - waiting.push( std::make_pair(limit,p) ); - return p.getFuture(); - } - - 
double get() const { return val; } - - void set( double v ) { - ASSERT( v >= val ); - if (v != val) { - val = v; - - std::vector> toSend; - while ( waiting.size() && v >= waiting.top().first ) { - Promise p = std::move(waiting.top().second); - waiting.pop(); - toSend.push_back(p); - } - for(auto& p : toSend) { - p.send(Void()); - } - } - } - - void operator=( double v ) { - set( v ); - } - - NotifiedDouble(NotifiedDouble&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(r.val) {} - void operator=(NotifiedDouble&& r) BOOST_NOEXCEPT { waiting = std::move(r.waiting); val = r.val; } - -private: - typedef std::pair> Item; - struct ItemCompare { - bool operator()(const Item& a, const Item& b) { return a.first > b.first; } - }; - std::priority_queue, ItemCompare> waiting; - double val; -}; +using NotifiedVersion = Notified; +using NotifiedDouble = Notified; #endif diff --git a/fdbserver/RestoreWorkerInterface.h b/fdbclient/RestoreWorkerInterface.actor.h similarity index 90% rename from fdbserver/RestoreWorkerInterface.h rename to fdbclient/RestoreWorkerInterface.actor.h index 278648a2ea..e0664a4b8a 100644 --- a/fdbserver/RestoreWorkerInterface.h +++ b/fdbclient/RestoreWorkerInterface.actor.h @@ -1,5 +1,5 @@ /* - * RestoreWorkerInterface.h + * RestoreWorkerInterface.actor.h * * This source file is part of the FoundationDB open source project * @@ -22,8 +22,11 @@ // which are RestoreMaster, RestoreLoader, and RestoreApplier #pragma once -#ifndef FDBSERVER_RESTORE_WORKER_INTERFACE_H -#define FDBSERVER_RESTORE_WORKER_INTERFACE_H +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H) + #define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H + #include "fdbclient/RestoreWorkerInterface.actor.g.h" +#elif !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_H) + #define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_H #include #include "flow/Stats.h" @@ -35,6 +38,7 @@ #include "fdbserver/CoordinationInterface.h" #include "fdbserver/Knobs.h" #include "fdbserver/RestoreUtil.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
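+// (The guard above follows the project's *.actor.h convention: in a real build
+// (NO_INTELLISENSE) this header defers to the actor-compiler generated
+// RestoreWorkerInterface.actor.g.h, while in IDE mode the un-generated source
+// is parsed directly; actorcompiler.h must remain the last #include, per the
+// comment above.)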
diff --git a/fdbserver/RestoreWorkerInterface.h b/fdbclient/RestoreWorkerInterface.actor.h
similarity index 90%
rename from fdbserver/RestoreWorkerInterface.h
rename to fdbclient/RestoreWorkerInterface.actor.h
index 278648a2ea..e0664a4b8a 100644
--- a/fdbserver/RestoreWorkerInterface.h
+++ b/fdbclient/RestoreWorkerInterface.actor.h
@@ -1,5 +1,5 @@
 /*
- * RestoreWorkerInterface.h
+ * RestoreWorkerInterface.actor.h
  *
  * This source file is part of the FoundationDB open source project
  *
@@ -22,8 +22,11 @@
 // which are RestoreMaster, RestoreLoader, and RestoreApplier
 
 #pragma once
-#ifndef FDBSERVER_RESTORE_WORKER_INTERFACE_H
-#define FDBSERVER_RESTORE_WORKER_INTERFACE_H
+#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H)
+	#define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H
+	#include "fdbclient/RestoreWorkerInterface.actor.g.h"
+#elif !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_H)
+	#define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_H
 
 #include <sstream>
 #include "flow/Stats.h"
@@ -35,6 +38,7 @@
 #include "fdbserver/CoordinationInterface.h"
 #include "fdbserver/Knobs.h"
 #include "fdbserver/RestoreUtil.h"
+#include "flow/actorcompiler.h" // This must be the last #include.
 
 class RestoreConfigFR;
 
@@ -43,8 +47,8 @@ struct RestoreRecruitRoleRequest;
 struct RestoreSysInfoRequest;
 struct RestoreLoadFileRequest;
 struct RestoreVersionBatchRequest;
+struct RestoreSendMutationsToAppliersRequest;
 struct RestoreSendMutationVectorVersionedRequest;
-struct RestoreSetApplierKeyRangeVectorRequest;
 struct RestoreSysInfo;
 struct RestoreApplierInterface;
 
@@ -121,10 +125,10 @@ struct RestoreLoaderInterface : RestoreRoleInterface {
 
 	RequestStream<RestoreSimpleRequest> heartbeat;
 	RequestStream<RestoreSysInfoRequest> updateRestoreSysInfo;
-	RequestStream<RestoreSetApplierKeyRangeVectorRequest> setApplierKeyRangeVectorRequest;
 	RequestStream<RestoreLoadFileRequest> loadFile;
+	RequestStream<RestoreSendMutationsToAppliersRequest> sendMutations;
 	RequestStream<RestoreVersionBatchRequest> initVersionBatch;
-	RequestStream<RestoreSimpleRequest> collectRestoreRoleInterfaces; // TODO: Change to collectRestoreRoleInterfaces
+	RequestStream<RestoreSimpleRequest> collectRestoreRoleInterfaces;
	RequestStream<RestoreSimpleRequest> finishRestore;
 
 	bool operator==(RestoreWorkerInterface const& r) const { return id() == r.id(); }
@@ -140,8 +144,8 @@ struct RestoreLoaderInterface : RestoreRoleInterface {
 	void initEndpoints() {
 		heartbeat.getEndpoint(TaskPriority::LoadBalancedEndpoint);
 		updateRestoreSysInfo.getEndpoint(TaskPriority::LoadBalancedEndpoint);
-		setApplierKeyRangeVectorRequest.getEndpoint(TaskPriority::LoadBalancedEndpoint);
 		loadFile.getEndpoint(TaskPriority::LoadBalancedEndpoint);
+		sendMutations.getEndpoint(TaskPriority::LoadBalancedEndpoint);
 		initVersionBatch.getEndpoint(TaskPriority::LoadBalancedEndpoint);
 		collectRestoreRoleInterfaces.getEndpoint(TaskPriority::LoadBalancedEndpoint);
 		finishRestore.getEndpoint(TaskPriority::LoadBalancedEndpoint);
@@ -149,8 +153,8 @@ struct RestoreLoaderInterface : RestoreRoleInterface {
 
 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, *(RestoreRoleInterface*)this, heartbeat, updateRestoreSysInfo, setApplierKeyRangeVectorRequest,
-		           loadFile, initVersionBatch, collectRestoreRoleInterfaces, finishRestore);
+		serializer(ar, *(RestoreRoleInterface*)this, heartbeat, updateRestoreSysInfo, loadFile, sendMutations,
+		           initVersionBatch, collectRestoreRoleInterfaces, finishRestore);
 	}
 };
 
@@ -338,6 +342,31 @@ struct RestoreLoadFileRequest : TimedRequest {
 	}
 };
 
+struct RestoreSendMutationsToAppliersRequest : TimedRequest {
+	constexpr static FileIdentifier file_identifier = 68827305;
+
+	std::map<Key, UID> rangeToApplier;
+	bool useRangeFile; // Send mutations parsed from range file?
+
+	ReplyPromise<RestoreCommonReply> reply;
+
+	RestoreSendMutationsToAppliersRequest() = default;
+	explicit RestoreSendMutationsToAppliersRequest(std::map<Key, UID> rangeToApplier, bool useRangeFile)
+	  : rangeToApplier(rangeToApplier), useRangeFile(useRangeFile) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, rangeToApplier, useRangeFile, reply);
+	}
+
+	std::string toString() {
+		std::stringstream ss;
+		ss << "RestoreSendMutationsToAppliersRequest rangeToApplier.size:" << rangeToApplier.size()
+		   << " useRangeFile:" << useRangeFile;
+		return ss.str();
+	}
+};
+
 struct RestoreSendMutationVectorVersionedRequest : TimedRequest {
 	constexpr static FileIdentifier file_identifier = 69764565;
 
@@ -356,7 +385,7 @@ struct RestoreSendMutationVectorVersionedRequest : TimedRequest {
 
 	std::string toString() {
 		std::stringstream ss;
-		ss << "fileIndex" << fileIndex << "prevVersion:" << prevVersion << " version:" << version
+		ss << "fileIndex:" << fileIndex << " prevVersion:" << prevVersion << " version:" << version
 		   << " isRangeFile:" << isRangeFile << " mutations.size:" << mutations.size();
 		return ss.str();
 	}
@@ -389,29 +418,6 @@ struct RestoreVersionBatchRequest : TimedRequest {
 	}
 };
 
-struct RestoreSetApplierKeyRangeVectorRequest : TimedRequest {
-	constexpr static FileIdentifier file_identifier = 92038306;
-
-	std::map<Standalone<KeyRef>, UID> rangeToApplier;
-
-	ReplyPromise<RestoreCommonReply> reply;
-
-	RestoreSetApplierKeyRangeVectorRequest() = default;
-	explicit RestoreSetApplierKeyRangeVectorRequest(std::map<Standalone<KeyRef>, UID> rangeToApplier)
-	  : rangeToApplier(rangeToApplier) {}
-
-	template <class Ar>
-	void serialize(Ar& ar) {
-		serializer(ar, rangeToApplier, reply);
-	}
-
-	std::string toString() {
-		std::stringstream ss;
-		ss << "RestoreVersionBatchRequest rangeToApplierSize:" << rangeToApplier.size();
-		return ss.str();
-	}
-};
-
 struct RestoreRequest {
 	constexpr static FileIdentifier file_identifier = 49589770;
 
@@ -467,7 +473,8 @@ struct RestoreRequest {
 std::string getRoleStr(RestoreRole role);
 
 ////--- Interface functions
-Future<Void> _restoreWorker(Database const& cx, LocalityData const& locality);
-Future<Void> restoreWorker(Reference<ClusterConnectionFile> const& ccf, LocalityData const& locality);
+ACTOR Future<Void> _restoreWorker(Database cx, LocalityData locality);
+ACTOR Future<Void> restoreWorker(Reference<ClusterConnectionFile> ccf, LocalityData locality);
 
-#endif
\ No newline at end of file
+#include "flow/unactorcompiler.h"
+#endif
diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp
index d0f93884da..53fd7641c6 100644
--- a/fdbclient/Schemas.cpp
+++ b/fdbclient/Schemas.cpp
@@ -49,7 +49,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
 					"resolution",
 					"proxy",
 					"master",
-					"test"
+					"test",
+					"storage_cache"
 				]
 			},
 			"degraded":true,
@@ -86,6 +87,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
 						"cluster_controller",
 						"data_distributor",
 						"ratekeeper",
+						"storage_cache",
 						"router",
 						"coordinator"
 					]
diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h
index 2505bf5a31..423b099018 100644
--- a/fdbclient/StorageServerInterface.h
+++ b/fdbclient/StorageServerInterface.h
@@ -189,8 +189,9 @@ struct GetKeyValuesReply : public LoadBalancedReply {
 	VectorRef<KeyValueRef> data;
 	Version version; // useful when latestVersion was requested
 	bool more;
+	bool cached;
 
-	GetKeyValuesReply() : version(invalidVersion), more(false) {}
+	GetKeyValuesReply() : version(invalidVersion), more(false), cached(false) {}
 
 	template <class Ar>
 	void serialize( Ar& ar ) {
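Like every request type in these interfaces, the new RestoreSendMutationsToAppliersRequest pairs a unique file_identifier with a serialize() that lists each field plus the ReplyPromise; that field list is the wire format, so its order must stay stable. A hedged sketch of a new request following the same pattern (the struct name and identifier below are invented for illustration):

// Sketch only: the general shape of a flow request type.
struct ExamplePingRequest : TimedRequest {
	constexpr static FileIdentifier file_identifier = 13075290; // hypothetical; must be globally unique

	int round;                              // payload fields come first...
	ReplyPromise<RestoreCommonReply> reply; // ...and the reply promise is serialized with them

	ExamplePingRequest() = default;
	explicit ExamplePingRequest(int round) : round(round) {}

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, round, reply); // append new fields at the end to stay compatible
	}
};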
diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp
index 8d80d50f3e..99bbed5429 100644
--- a/fdbclient/SystemData.cpp
+++ b/fdbclient/SystemData.cpp
@@ -58,6 +58,28 @@ void decodeKeyServersValue( const ValueRef& value, vector<UID>& src, vector<UID>
 	}
 }
 
+// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
+const KeyRangeRef storageCacheKeys( LiteralStringRef("\xff/storageCache/"), LiteralStringRef("\xff/storageCache0") );
+const KeyRef storageCachePrefix = storageCacheKeys.begin;
+
+const Key storageCacheKey( const KeyRef& k ) {
+	return k.withPrefix( storageCachePrefix );
+}
+
+const Value storageCacheValue( const vector<uint16_t>& serverIndices ) {
+	BinaryWriter wr((IncludeVersion()));
+	wr << serverIndices;
+	return wr.toValue();
+}
+
+void decodeStorageCacheValue( const ValueRef& value, vector<uint16_t>& serverIndices ) {
+	serverIndices.clear();
+	if (value.size()) {
+		BinaryReader rd(value, IncludeVersion());
+		rd >> serverIndices;
+	}
+}
+
 const Value logsValue( const vector<std::pair<UID, NetworkAddress>>& logs, const vector<std::pair<UID, NetworkAddress>>& oldLogs ) {
 	BinaryWriter wr(IncludeVersion());
 	wr << logs;
@@ -73,7 +95,6 @@ std::pair<vector<std::pair<UID, NetworkAddress>>,vector<std::pair<UID, NetworkAddress>>> decodeLogsValue( const ValueRef& value )
+const KeyRef cacheKeysPrefix = LiteralStringRef("\xff\x02/cacheKeys/");
+
+const Key cacheKeysKey( uint16_t idx, const KeyRef& key ) {
+	BinaryWriter wr(Unversioned());
+	wr.serializeBytes( cacheKeysPrefix );
+	wr << idx;
+	wr.serializeBytes( LiteralStringRef("/") );
+	wr.serializeBytes( key );
+	return wr.toValue();
+}
+const Key cacheKeysPrefixFor( uint16_t idx ) {
+	BinaryWriter wr(Unversioned());
+	wr.serializeBytes( cacheKeysPrefix );
+	wr << idx;
+	wr.serializeBytes( LiteralStringRef("/") );
+	return wr.toValue();
+}
+uint16_t cacheKeysDecodeIndex( const KeyRef& key ) {
+	uint16_t idx;
+	BinaryReader rd( key.removePrefix(cacheKeysPrefix), Unversioned() );
+	rd >> idx;
+	return idx;
+}
+KeyRef cacheKeysDecodeKey( const KeyRef& key ) {
+	return key.substr( cacheKeysPrefix.size() + sizeof(uint16_t) + 1);
+}
+
+const KeyRef cacheChangeKey = LiteralStringRef("\xff\x02/cacheChangeKey");
+const KeyRangeRef cacheChangeKeys( LiteralStringRef("\xff\x02/cacheChangeKeys/"), LiteralStringRef("\xff\x02/cacheChangeKeys0") );
+const KeyRef cacheChangePrefix = cacheChangeKeys.begin;
+const Key cacheChangeKeyFor( uint16_t idx ) {
+	BinaryWriter wr(Unversioned());
+	wr.serializeBytes( cacheChangePrefix );
+	wr << idx;
+	return wr.toValue();
+}
+uint16_t cacheChangeKeyDecodeIndex( const KeyRef& key ) {
+	uint16_t idx;
+	BinaryReader rd( key.removePrefix(cacheChangePrefix), Unversioned() );
+	rd >> idx;
+	return idx;
+}
+
 const KeyRangeRef serverTagKeys( LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0") );
@@ -641,13 +705,22 @@ const KeyRangeRef restoreApplierKeys(LiteralStringRef("\xff\x02/restoreApplier/"
 const KeyRef restoreApplierTxnValue = LiteralStringRef("1");
 
 // restoreApplierKeys: track atomic transaction progress to ensure applying atomicOp exactly once
+// The version is passed in little-endian; it must be converted to big-endian so the encoded keys sort in version order
 const Key restoreApplierKeyFor(UID const& applierID, Version version) {
 	BinaryWriter wr(Unversioned());
-	wr.serializeBytes(restoreWorkersKeys.begin);
-	wr << applierID << version;
+	wr.serializeBytes(restoreApplierKeys.begin);
+	wr << applierID << bigEndian64(version);
 	return wr.toValue();
 }
 
+std::pair<UID, Version> decodeRestoreApplierKey(ValueRef const& key) {
+	BinaryReader rd(key, Unversioned());
+	UID applierID;
+	Version version;
+	rd >> applierID >> version;
+	return std::make_pair(applierID, bigEndian64(version));
+}
+
 // Encode restore worker key for workerID
 const Key restoreWorkerKeyFor(UID const& workerID) {
 	BinaryWriter wr(Unversioned());
@@ -678,7 +751,7 @@ const Value restoreRequestTriggerValue(UID randomID, int const numRequests) {
 	wr << randomID;
 	return wr.toValue();
 }
-const int decodeRestoreRequestTriggerValue(ValueRef const& value) {
+int decodeRestoreRequestTriggerValue(ValueRef const& value) {
 	int s;
 	UID randomID;
 	BinaryReader reader(value, IncludeVersion());
diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h
index 066c3e5dc1..c3debfac3f 100644
--- a/fdbclient/SystemData.h
+++ b/fdbclient/SystemData.h
@@ -26,7 +26,7 @@
 
 #include "fdbclient/FDBTypes.h"
 #include "fdbclient/StorageServerInterface.h"
-#include "fdbserver/RestoreWorkerInterface.h"
"fdbclient/RestoreWorkerInterface.actor.h" struct RestoreLoaderInterface; struct RestoreApplierInterface; @@ -49,6 +49,13 @@ const Value keyServersValue( void decodeKeyServersValue( const ValueRef& value, vector& src, vector& dest ); +// "\xff/storageCache/[[begin]]" := "[[vector]]" +extern const KeyRangeRef storageCacheKeys; +extern const KeyRef storageCachePrefix; +const Key storageCacheKey( const KeyRef& k ); +const Value storageCacheValue( const vector& serverIndices ); +void decodeStorageCacheValue( const ValueRef& value, vector& serverIndices ); + // "\xff/serverKeys/[[serverID]]/[[begin]]" := "" | "1" | "2" extern const KeyRef serverKeysPrefix; extern const ValueRef serverKeysTrue, serverKeysFalse; @@ -57,6 +64,19 @@ const Key serverKeysPrefixFor( UID serverID ); UID serverKeysDecodeServer( const KeyRef& key ); bool serverHasKey( ValueRef storedValue ); +extern const KeyRef cacheKeysPrefix; + +const Key cacheKeysKey( uint16_t idx, const KeyRef& key ); +const Key cacheKeysPrefixFor( uint16_t idx ); +uint16_t cacheKeysDecodeIndex( const KeyRef& key ); +KeyRef cacheKeysDecodeKey( const KeyRef& key ); + +extern const KeyRef cacheChangeKey; +extern const KeyRangeRef cacheChangeKeys; +extern const KeyRef cacheChangePrefix; +const Key cacheChangeKeyFor( uint16_t idx ); +uint16_t cacheChangeKeyDecodeIndex( const KeyRef& key ); + extern const KeyRangeRef serverTagKeys; extern const KeyRef serverTagPrefix; extern const KeyRangeRef serverTagMaxKeys; @@ -298,11 +318,12 @@ extern const KeyRangeRef restoreApplierKeys; extern const KeyRef restoreApplierTxnValue; const Key restoreApplierKeyFor(UID const& applierID, Version version); +std::pair decodeRestoreApplierKey(ValueRef const& key); const Key restoreWorkerKeyFor(UID const& workerID); const Value restoreWorkerInterfaceValue(RestoreWorkerInterface const& server); RestoreWorkerInterface decodeRestoreWorkerInterfaceValue(ValueRef const& value); const Value restoreRequestTriggerValue(UID randomUID, int const numRequests); -const int decodeRestoreRequestTriggerValue(ValueRef const& value); +int decodeRestoreRequestTriggerValue(ValueRef const& value); const Value restoreRequestDoneVersionValue(Version readVersion); Version decodeRestoreRequestDoneVersionValue(ValueRef const& value); const Key restoreRequestKeyFor(int const& index); diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 7772aae862..c71482b3b9 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -333,9 +333,9 @@ void ThreadSafeTransaction::reset() { onMainThreadVoid( [tr](){ tr->reset(); }, NULL ); } -extern const char* getHGVersion(); +extern const char* getSourceVersion(); -ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getHGVersion(), currentProtocolVersion)), transportId(0) {} +ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion)), transportId(0) {} void ThreadSafeApi::selectApiVersion(int apiVersion) { this->apiVersion = apiVersion; diff --git a/fdbclient/VersionedMap.h b/fdbclient/VersionedMap.h index c82ce673c8..8f49f4e25e 100644 --- a/fdbclient/VersionedMap.h +++ b/fdbclient/VersionedMap.h @@ -414,16 +414,19 @@ namespace PTreeImpl { if (p->left(at)) printTree(p->left(at), at, depth+1); for (int i=0;idata).c_str()); + //printf(":%s\n", describe(p->data.value.first).c_str()); + printf(":%s\n", describe(p->data.key).c_str()); if 
(p->right(at)) printTree(p->right(at), at, depth+1); } template void printTreeDetails(const Reference>& p, int depth = 0) { - printf("Node %p (depth %d): %s\n", p.getPtr(), depth, describe(p->data).c_str()); + //printf("Node %p (depth %d): %s\n", p.getPtr(), depth, describe(p->data.value.first).c_str()); + printf("Node %p (depth %d): %s\n", p.getPtr(), depth, describe(p->data.key).c_str()); printf(" Left: %p\n", p->pointer[0].getPtr()); printf(" Right: %p\n", p->pointer[1].getPtr()); - if (p->pointer[2]) + //if (p->pointer[2]) + if (p->updated) printf(" Version %lld %s: %p\n", p->lastUpdateVersion, p->replacedPointer ? "Right" : "Left", p->pointer[2].getPtr()); for(int i=0; i<3; i++) if (p->pointer[i]) printTreeDetails(p->pointer[i], depth+1); @@ -462,8 +465,47 @@ namespace PTreeImpl { } } + //Remove pointers to any child nodes that have been updated at or before the given version + //This essentially gets rid of node versions that will never be read (beyond 5s worth of versions) + //TODO look into making this per-version compaction. (We could keep track of updated nodes at each version for example) + template + void compact(Reference>& p, Version newOldestVersion){ + if (!p) { + return; + } + if (p->updated && p->lastUpdateVersion <= newOldestVersion) { + /* If the node has been updated, figure out which pointer was repalced. And delete that pointer */ + auto which = p->replacedPointer; + p->pointer[which] = p->pointer[2]; + p->updated = false; + p->pointer[2] = Reference>(); + //p->pointer[which] = Reference>(); + } + Reference> left = p->left(newOldestVersion); + Reference> right = p->right(newOldestVersion); + compact(left, newOldestVersion); + compact(right, newOldestVersion); + } + } +class ValueOrClearToRef { +public: + static ValueOrClearToRef value(ValueRef const& v) { return ValueOrClearToRef(v, false); } + static ValueOrClearToRef clearTo(KeyRef const& k) { return ValueOrClearToRef(k, true); } + + bool isValue() const { return !isClear; }; + bool isClearTo() const { return isClear; } + + ValueRef const& getValue() const { ASSERT( isValue() ); return item; }; + KeyRef const& getEndKey() const { ASSERT(isClearTo()); return item; }; + +private: + ValueOrClearToRef( StringRef item, bool isClear ) : item(item), isClear(isClear) {} + StringRef item; + bool isClear; +}; + // VersionedMap provides an interface to a partially persistent tree, allowing you to read the values at a particular version, // create new versions, modify the current version of the tree, and forget versions prior to a specific version. 
 // VersionedMap provides an interface to a partially persistent tree, allowing you to read the values at a particular version,
 // create new versions, modify the current version of the tree, and forget versions prior to a specific version.
 template <class K, class T>
@@ -597,6 +639,26 @@ public:
 		erase(key);
 	}
 
+	void printDetail() {
+		PTreeImpl::printTreeDetails(roots.back().second, 0);
+	}
+
+	void printTree(Version at) {
+		PTreeImpl::printTree(roots.back().second, at, 0);
+	}
+
+	void compact(Version newOldestVersion) {
+		ASSERT( newOldestVersion <= latestVersion );
+		//auto newBegin = roots.lower_bound(newOldestVersion);
+		auto newBegin = lower_bound(roots.begin(), roots.end(), newOldestVersion, rootsComparator());
+		for(auto root = roots.begin(); root != newBegin; ++root) {
+			if(root->second)
+				PTreeImpl::compact(root->second, newOldestVersion);
+		}
+		//printf("\nPrinting the tree at latest version after compaction.\n");
+		//PTreeImpl::printTreeDetails(roots.back().second(), 0);
+	}
+
 	// for(auto i = vm.at(version).lower_bound(range.begin); i < range.end; ++i)
 	struct iterator{
 		explicit iterator(Tree const& root, Version at) : root(root), at(at) {}
@@ -686,6 +748,11 @@ public:
 	ViewAtVersion at( Version v ) const { return ViewAtVersion(getRoot(v), v); }
 	ViewAtVersion atLatest() const { return ViewAtVersion(roots.back().second, latestVersion); }
 
+	bool isClearContaining( ViewAtVersion const& view, KeyRef key ) {
+		auto i = view.lastLessOrEqual(key);
+		return i && i->isClearTo() && i->getEndKey() > key;
+	}
+
 	// TODO: getHistory?
 };
diff --git a/fdbclient/fdbclient.vcxproj b/fdbclient/fdbclient.vcxproj
index be793d900d..974aa896a8 100644
--- a/fdbclient/fdbclient.vcxproj
+++ b/fdbclient/fdbclient.vcxproj
@@ -89,6 +89,9 @@
+    <ActorCompiler Include="RestoreWorkerInterface.actor.h">
+      <EnableCompile>false</EnableCompile>
+    </ActorCompiler>
diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h
index f786266888..f3450af847 100644
--- a/fdbrpc/AsyncFileEIO.actor.h
+++ b/fdbrpc/AsyncFileEIO.actor.h
@@ -45,7 +45,8 @@ class AsyncFileEIO : public IAsyncFile, public ReferenceCounted<AsyncFileEIO> {
 
 public:
 	static void init() {
-		if (eio_init( &eio_want_poll, NULL )) {
+		eio_set_max_parallel(FLOW_KNOBS->EIO_MAX_PARALLELISM);
+		if (eio_init( &eio_want_poll, NULL )) {
 			TraceEvent("EioInitError").detail("ErrorNo", errno);
 			throw platform_error();
 		}
@@ -246,6 +247,9 @@ private:
 		if( flags & OPEN_READONLY ) oflags |= O_RDONLY;
 		if( flags & OPEN_READWRITE ) oflags |= O_RDWR;
 		if( flags & OPEN_ATOMIC_WRITE_AND_CREATE ) oflags |= O_TRUNC;
+#if defined(__linux__)
+		if ( flags & OPEN_UNBUFFERED && FLOW_KNOBS->EIO_USE_ODIRECT ) oflags |= O_DIRECT;
+#endif
 		return oflags;
 	}
 
diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp
index 7317c81ff0..2939f96b6f 100644
--- a/fdbrpc/FlowTests.actor.cpp
+++ b/fdbrpc/FlowTests.actor.cpp
@@ -50,24 +50,27 @@ TEST_CASE("/flow/actorcompiler/lineNumbers") {
 	return Void();
 }
 
-TEST_CASE("/flow/delayOrdering") {
-	state double x = deterministicRandom()->random01();
-	state double y = deterministicRandom()->random01();
-	if (BUGGIFY) {
-		y = x;
+TEST_CASE("/flow/buggifiedDelay") {
+	if (FLOW_KNOBS->MAX_BUGGIFIED_DELAY == 0) {
+		return Void();
+	}
+	loop {
+		state double x = deterministicRandom()->random01();
+		state int last = 0;
+		state Future<Void> f1 = map(delay(x), [last = &last](const Void&) {
+			*last = 1;
+			return Void();
+		});
+		state Future<Void> f2 = map(delay(x), [last = &last](const Void&) {
+			*last = 2;
+			return Void();
+		});
+		wait(f1 && f2);
+		if (last == 1) {
+			TEST(true); // Delays can become ready out of order
+			return Void();
+		}
 	}
-	state int last = 0;
-	state Future<Void> f1 = map(delay(x), [last = &last](const Void&) {
-		*last = 1;
-		return Void();
-	});
-	state Future<Void> f2 = map(delay(y), [last = &last](const Void&) {
-		*last = 2;
-		return Void();
-	});
-	wait(f1 && f2);
-	ASSERT((x <= y) == (last == 2));
-	return Void();
 }
 
 template
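The rewritten /flow/buggifiedDelay test loops until two delays of the same length complete in reverse creation order, which only buggified delays permit; map() is what lets it record completion order without a separate actor. A minimal sketch of the same probe, under flow's ACTOR conventions:

// Sketch: record which of two equal delays fires last.
ACTOR Future<bool> delaysReordered() {
	state int last = 0;
	state Future<Void> a = map(delay(0.5), [last = &last](const Void&) { *last = 1; return Void(); });
	state Future<Void> b = map(delay(0.5), [last = &last](const Void&) { *last = 2; return Void(); });
	wait(a && b);
	// Outside simulation equal delays normally complete in creation order (last == 2);
	// under buggification last == 1 is possible, which is what the test waits to observe.
	return last == 1;
}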
diff --git a/fdbrpc/Locality.cpp b/fdbrpc/Locality.cpp
index 424cf81733..27fa654372 100644
--- a/fdbrpc/Locality.cpp
+++ b/fdbrpc/Locality.cpp
@@ -40,8 +40,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 		case ProcessClass::LogClass:
 			return ProcessClass::WorstFit;
 		case ProcessClass::CoordinatorClass:
-			return ProcessClass::NeverAssign;
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::NeverAssign;
@@ -57,8 +57,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 		case ProcessClass::StorageClass:
 			return ProcessClass::WorstFit;
 		case ProcessClass::CoordinatorClass:
-			return ProcessClass::NeverAssign;
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::NeverAssign;
@@ -76,8 +76,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 		case ProcessClass::TransactionClass:
 			return ProcessClass::OkayFit;
 		case ProcessClass::CoordinatorClass:
-			return ProcessClass::NeverAssign;
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::WorstFit;
@@ -93,8 +93,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 		case ProcessClass::ResolutionClass:
 			return ProcessClass::OkayFit;
 		case ProcessClass::CoordinatorClass:
-			return ProcessClass::NeverAssign;
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::WorstFit;
@@ -110,8 +110,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 		case ProcessClass::TransactionClass:
 			return ProcessClass::OkayFit;
 		case ProcessClass::CoordinatorClass:
-			return ProcessClass::NeverAssign;
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::WorstFit;
@@ -129,8 +129,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 		case ProcessClass::TransactionClass:
 			return ProcessClass::OkayFit;
 		case ProcessClass::CoordinatorClass:
-			return ProcessClass::NeverAssign;
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::WorstFit;
@@ -154,8 +154,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 		case ProcessClass::LogRouterClass:
 			return ProcessClass::OkayFit;
 		case ProcessClass::CoordinatorClass:
-			return ProcessClass::NeverAssign;
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::WorstFit;
@@ -172,6 +172,7 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 			return ProcessClass::OkayFit;
 		case ProcessClass::CoordinatorClass:
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::WorstFit;
@@ -188,10 +189,18 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
 			return ProcessClass::OkayFit;
 		case ProcessClass::CoordinatorClass:
 		case ProcessClass::TesterClass:
+		case ProcessClass::StorageCacheClass:
 			return ProcessClass::NeverAssign;
 		default:
 			return ProcessClass::WorstFit;
 		}
+	case ProcessClass::StorageCache:
+		switch( _class ) {
+		case ProcessClass::StorageCacheClass:
+			return ProcessClass::BestFit;
+		default:
+			return ProcessClass::NeverAssign;
+		}
 	default:
 		return ProcessClass::NeverAssign;
 	}
diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h
index c8f2b096ae..78cb49b638 100644
--- a/fdbrpc/Locality.h
+++ b/fdbrpc/Locality.h
@@ -43,11 +43,12 @@ struct ProcessClass {
 		DataDistributorClass,
 		CoordinatorClass,
 		RatekeeperClass,
+		StorageCacheClass,
 		InvalidClass = -1
 	};
 
 	enum Fitness { BestFit, GoodFit, UnsetFit, OkayFit, WorstFit, ExcludeFit, NeverAssign }; //cannot be larger than 7 because of leader election mask
-	enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, Ratekeeper, NoRole };
+	enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, Ratekeeper, StorageCache, NoRole };
 	enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 };
 	int16_t _class;
 	int16_t _source;
@@ -72,6 +73,7 @@ public:
 		else if (s=="data_distributor") _class = DataDistributorClass;
 		else if (s=="coordinator") _class = CoordinatorClass;
 		else if (s=="ratekeeper") _class = RatekeeperClass;
+		else if (s=="storage_cache") _class = StorageCacheClass;
 		else _class = InvalidClass;
 	}
 
@@ -91,6 +93,7 @@ public:
 		else if (classStr=="data_distributor") _class = DataDistributorClass;
 		else if (classStr=="coordinator") _class = CoordinatorClass;
 		else if (classStr=="ratekeeper") _class = RatekeeperClass;
+		else if (classStr=="storage_cache") _class = StorageCacheClass;
 		else _class = InvalidClass;
 
 		if (sourceStr=="command_line") _source = CommandLineSource;
@@ -125,6 +128,7 @@ public:
 			case DataDistributorClass: return "data_distributor";
 			case CoordinatorClass: return "coordinator";
 			case RatekeeperClass: return "ratekeeper";
+			case StorageCacheClass: return "storage_cache";
 			default: return "invalid";
 		}
 	}
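Wiring a new class through Locality.h and Locality.cpp means the string form, the enum, and the fitness table all have to agree: the new storage_cache class is BestFit for the StorageCache role and NeverAssign for everything else. A quick sketch of that mapping (assumes ProcessClass's existing accessors; not from the patch):

// Sketch: the storage_cache class only ever hosts the StorageCache role.
ProcessClass c("storage_cache", ProcessClass::CommandLineSource);
ASSERT(c.classType() == ProcessClass::StorageCacheClass);
ASSERT(c.machineClassFitness(ProcessClass::StorageCache) == ProcessClass::BestFit);
ASSERT(c.machineClassFitness(ProcessClass::Storage) == ProcessClass::NeverAssign);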
if ((flags & IAsyncFile::OPEN_UNBUFFERED) && !(flags & IAsyncFile::OPEN_NO_AIO) && !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO) f = AsyncFileKAIO::open(filename, flags, mode, NULL); diff --git a/fdbrpc/fdbrpc.vcxproj b/fdbrpc/fdbrpc.vcxproj index b77c8d24f8..801321b336 100644 --- a/fdbrpc/fdbrpc.vcxproj +++ b/fdbrpc/fdbrpc.vcxproj @@ -163,8 +163,8 @@ - echo const char *hgVersion = "Current version id not currently supported within Windows."; > hgVersion.temp.h && fc /b hgVersion.temp.h hgVersion.h > nul || copy hgVersion.temp.h hgVersion.h > nul - Checking HG source version + echo const char *sourceVersion = "Current version id not currently supported within Windows."; > SourceVersion.temp.h && fc /b SourceVersion.temp.h SourceVersion.h > nul || copy SourceVersion.temp.h SourceVersion.h > nul + Checking source version fake.out diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index c78fd82edb..bff5cf93b9 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -98,6 +98,7 @@ public: case ProcessClass::ClusterControllerClass: return false; case ProcessClass::DataDistributorClass: return false; case ProcessClass::RatekeeperClass: return false; + case ProcessClass::StorageCacheClass: return false; default: return false; } } diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 7dea8d1723..ffe84e8e4d 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -46,8 +46,10 @@ Reference getStorageInfo(UID id, std::map const& mutations, IKeyValueStore* txnStateStore, LogPushData* toCommit, bool *confChange, Reference logSystem, Version popVersion, - KeyRangeMap >* vecBackupKeys, KeyRangeMap* keyInfo, std::map* uid_applyMutationsData, RequestStream commit, - Database cx, NotifiedVersion* commitVersion, std::map>* storageCache, std::map* tag_popped, bool initialCommit ) { + KeyRangeMap >* vecBackupKeys, KeyRangeMap* keyInfo, KeyRangeMap* cacheInfo, std::map* uid_applyMutationsData, RequestStream commit, + Database cx, NotifiedVersion* commitVersion, std::map>* storageCache, std::map* tag_popped, bool initialCommit ) { + //std::map> cacheRangeInfo; + std::map cachedRangeInfo; for (auto const& m : mutations) { //TraceEvent("MetadataMutation", dbgid).detail("M", m.toString()); @@ -129,6 +131,37 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefrangeContaining(k).end(); + vector serverIndices; + decodeStorageCacheValue(m.param2, serverIndices); + cacheInfo->insert(KeyRangeRef(k,end),serverIndices.size() > 0); + } + } + if(!initialCommit) txnStateStore->set(KeyValueRef(m.param1, m.param2)); + } else if (m.param1.startsWith(cacheKeysPrefix)) { + // Create a private mutation for cache servers + // This is done to make the cache servers aware of the cached key-ranges + if(toCommit) { + MutationRef privatized = m; + privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); + TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString()); + toCommit->addTag( cacheTag ); + toCommit->addTypedMessage(privatized); + } } else if (m.param1.startsWith(configKeysPrefix) || m.param1 == coordinatorsKey) { if(Optional(m.param2) != txnStateStore->readValue(m.param1).get().castTo()) { // FIXME: Make this check more specific, here or by reading configuration whenever there is a change @@ -138,7 +171,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefset(KeyValueRef(m.param1, m.param2)); - if(uid_applyMutationsData != NULL) { + 
+			if(uid_applyMutationsData != nullptr) {
 				Key uid = m.param1.removePrefix(applyMutationsEndRange.begin);
 				auto &p = (*uid_applyMutationsData)[uid];
 				p.endVersion = BinaryReader::fromStringRef<Version>(m.param2, Unversioned());
@@ -190,7 +223,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
 			txnStateStore->set(KeyValueRef(m.param1, m.param2));
-			if(uid_applyMutationsData != NULL) {
+			if(uid_applyMutationsData != nullptr) {
 				if(m.param1.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID)) {
 					Key uid = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID));
 					Key k = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID));
@@ -205,7 +238,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
 			txnStateStore->set(KeyValueRef(m.param1, m.param2));
 			if (vecBackupKeys) {
 				Key logDestination;
-				KeyRef logRangeBegin = logRangesDecodeKey(m.param1, NULL);
+				KeyRef logRangeBegin = logRangesDecodeKey(m.param1, nullptr);
 				Key logRangeEnd = logRangesDecodeValue(m.param2, &logDestination);
 
 				// Insert the logDestination into each range of vecBackupKeys overlapping the decoded range
@@ -345,7 +378,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
 			txnStateStore->clear(commonEndRange);
-			if(uid_applyMutationsData != NULL) {
+			if(uid_applyMutationsData != nullptr) {
 				uid_applyMutationsData->erase(uid_applyMutationsData->lower_bound(m.param1.substr(applyMutationsEndRange.begin.size())),
 					m.param2 == applyMutationsEndRange.end ? uid_applyMutationsData->end() : uid_applyMutationsData->lower_bound(m.param2.substr(applyMutationsEndRange.begin.size())));
 			}
@@ -353,7 +386,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
 			txnStateStore->clear(commonApplyRange);
-			if(uid_applyMutationsData != NULL) {
+			if(uid_applyMutationsData != nullptr) {
 				if(m.param1.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID) && m.param2.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID)) {
 					Key uid = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID));
 					Key uid2 = m.param2.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID));
@@ -389,7 +422,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
 	}
 
+	// If we accumulated private mutations for cached key-ranges, pair them up and tag
+	// them with the storage servers that own those ranges before pushing them to the tlogs
+	if (toCommit && cachedRangeInfo.size() > 0) {
+		std::map<KeyRef, MutationRef>::iterator itr;
+		KeyRef keyBegin, keyEnd;
+		vector<uint16_t> serverIndices;
+		MutationRef mutationBegin, mutationEnd;
+
+		for (itr = cachedRangeInfo.begin(); itr != cachedRangeInfo.end(); ++itr) {
+			// first figure out the begin and end keys for the cached-range,
+			// the begin and end mutations can be in any order
+			decodeStorageCacheValue(itr->second.param2, serverIndices);
+			// serverIndices count should be greater than zero for beginKey mutations
+			if (serverIndices.size() > 0) {
+				keyBegin = itr->first;
+				mutationBegin = itr->second;
+				++itr;
+				keyEnd = itr->first;
+				mutationEnd = itr->second;
+			} else {
+				keyEnd = itr->first;
+				mutationEnd = itr->second;
+				++itr;
+				keyBegin = itr->first;
+				mutationBegin = itr->second;
+			}
+
+			// Now get all the storage server tags for the cached key-ranges
+			std::set<Tag> allTags;
+			auto ranges = keyInfo->intersectingRanges(KeyRangeRef(keyBegin, keyEnd));
+			for(auto it : ranges) {
+				auto& r = it.value();
+				for(auto info : r.src_info) {
+					allTags.insert(info->tag);
+				}
+				for(auto info : r.dest_info) {
+					allTags.insert(info->tag);
+				}
+			}
+
+			// Add the tags to both begin and end mutations
+			toCommit->addTags(allTags);
+			toCommit->addTypedMessage(mutationBegin);
+			toCommit->addTags(allTags);
+			toCommit->addTypedMessage(mutationEnd);
+		}
+	}
 }
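Both new branches in applyMetadataMutations turn a \xff/... metadata write into a "private" mutation by prefixing the key with systemKeys.begin (another \xff byte) and tagging it, so only the addressed recipients (cacheTag, or the storage server tags gathered from keyInfo) ever replay it from the tlog stream. The key transformation in isolation (hedged sketch with a hypothetical value):

// Sketch: privatizing a metadata mutation for the tlog stream.
Arena arena;
Key original = cacheKeysPrefixFor(0); // \xff\x02/cacheKeys/<idx>/
MutationRef m(MutationRef::SetValue, original, LiteralStringRef(""));
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); // now \xff\xff\x02/cacheKeys/...
// toCommit->addTag(cacheTag); toCommit->addTypedMessage(privatized);

Moving the key above \xff\xff keeps it out of the normal keyspace entirely; delivery is controlled purely by the tags attached before the message is pushed.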
diff --git a/fdbserver/ApplyMetadataMutation.h b/fdbserver/ApplyMetadataMutation.h
index 78bd1cc5d2..54cd140f3c 100644
--- a/fdbserver/ApplyMetadataMutation.h
+++ b/fdbserver/ApplyMetadataMutation.h
@@ -45,7 +45,7 @@ struct applyMutationsData {
 Reference<StorageInfo> getStorageInfo(UID id, std::map<UID, Reference<StorageInfo>>* storageCache, IKeyValueStore* txnStateStore);
 
 void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRef> const& mutations, IKeyValueStore* txnStateStore, LogPushData* toCommit, bool *confChange, Reference<ILogSystem> logSystem = Reference<ILogSystem>(), Version popVersion = 0,
-	KeyRangeMap<std::set<Key>>* vecBackupKeys = NULL, KeyRangeMap<ServerCacheInfo>* keyInfo = NULL, std::map<Key, applyMutationsData>* uid_applyMutationsData = NULL, RequestStream<CommitTransactionRequest> commit = RequestStream<CommitTransactionRequest>(),
-	Database cx = Database(), NotifiedVersion* commitVersion = NULL, std::map<UID, Reference<StorageInfo>>* storageCache = NULL, std::map<Tag, Version>* tag_popped = NULL, bool initialCommit = false );
+	KeyRangeMap<std::set<Key>>* vecBackupKeys = nullptr, KeyRangeMap<ServerCacheInfo>* keyInfo = nullptr, KeyRangeMap<bool>* cacheInfo = nullptr, std::map<Key, applyMutationsData>* uid_applyMutationsData = nullptr, RequestStream<CommitTransactionRequest> commit = RequestStream<CommitTransactionRequest>(),
+	Database cx = Database(), NotifiedVersion* commitVersion = nullptr, std::map<UID, Reference<StorageInfo>>* storageCache = nullptr, std::map<Tag, Version>* tag_popped = nullptr, bool initialCommit = false );
 
 #endif
diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt
index 6e69968ed4..7883802e0d 100644
--- a/fdbserver/CMakeLists.txt
+++ b/fdbserver/CMakeLists.txt
@@ -24,8 +24,6 @@ set(FDBSERVER_SRCS
   IKeyValueStore.h
   IPager.h
   IVersionedStore.h
-  IndirectShadowPager.actor.cpp
-  IndirectShadowPager.h
   KeyValueStoreCompressTestData.actor.cpp
   KeyValueStoreMemory.actor.cpp
   KeyValueStoreSQLite.actor.cpp
@@ -45,8 +43,6 @@ set(FDBSERVER_SRCS
   MasterInterface.h
   MasterProxyServer.actor.cpp
  masterserver.actor.cpp
-  MemoryPager.actor.cpp
-  MemoryPager.h
   MoveKeys.actor.cpp
   MoveKeys.actor.h
   networktest.actor.cpp
@@ -76,7 +72,6 @@ set(FDBSERVER_SRCS
   RestoreLoader.actor.cpp
   RestoreWorker.actor.h
   RestoreWorker.actor.cpp
-  RestoreWorkerInterface.h
   Resolver.actor.cpp
   ResolverInterface.h
   ServerDBInfo.h
@@ -85,6 +80,7 @@ set(FDBSERVER_SRCS
   SkipList.cpp
   Status.actor.cpp
   Status.h
+  StorageCache.actor.cpp
   StorageMetrics.actor.h
   StorageMetrics.h
   storageserver.actor.cpp
diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp
index be97ec41a3..ffcacce08c 100644
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@@ -57,13 +57,15 @@ struct WorkerInfo : NonCopyable {
 	WorkerDetails details;
 	Future<Void> haltRatekeeper;
 	Future<Void> haltDistributor;
+	Optional<uint16_t> storageCacheInfo;
 
 	WorkerInfo() : gen(-1), reboots(0), lastAvailableTime(now()), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
 	WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded ) :
 		watcher(watcher), reply(reply), gen(gen), reboots(0), lastAvailableTime(now()), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded) {}
 
 	WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen),
-		reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)) {}
+		reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)),
+		haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo) {}
 	void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT {
 		watcher = std::move(r.watcher);
 		reply = std::move(r.reply);
@@ -73,6 +75,9 @@ struct WorkerInfo : NonCopyable {
 		initialClass = r.initialClass;
 		priorityInfo = r.priorityInfo;
 		details = std::move(r.details);
+		haltRatekeeper = r.haltRatekeeper;
+		haltDistributor = r.haltDistributor;
+		storageCacheInfo = r.storageCacheInfo;
 	}
 };
 
@@ -101,9 +106,11 @@ public:
 		Database db;
 		int unfinishedRecoveries;
 		int logGenerations;
+		std::map<uint16_t, std::pair<Optional<StorageServerInterface>, Optional<Key>>> cacheInterfaces;
+		bool cachePopulated;
 		std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>> clientStatus;
 
-		DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0),
+		DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), cachePopulated(false),
 			clientInfo( new AsyncVar<ClientDBInfo>( ClientDBInfo() ) ),
 			serverInfo( new AsyncVar<CachedSerialization<ServerDBInfo>>( CachedSerialization<ServerDBInfo>() ) ),
 			db( DatabaseContext::create( clientInfo, Future<Void>(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality!
@@ -126,6 +133,27 @@ public:
 			serverInfo->set( newInfoCache );
 		}
 
+		void setStorageCache(uint16_t id, const StorageServerInterface& interf) {
+			CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
+			auto& newInfo = newInfoCache.mutate();
+			bool found = false;
+			for(auto& it : newInfo.storageCaches) {
+				if(it.first == id) {
+					if(it.second != interf) {
+						newInfo.id = deterministicRandom()->randomUniqueID();
+						it.second = interf;
+					}
+					found = true;
+					break;
+				}
+			}
+			if(!found) {
+				newInfo.id = deterministicRandom()->randomUniqueID();
+				newInfo.storageCaches.push_back(std::make_pair(id, interf));
+			}
+			serverInfo->set( newInfoCache );
+		}
+
 		void clearInterf(ProcessClass::ClassType t) {
 			CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
 			auto& newInfo = newInfoCache.mutate();
@@ -137,6 +165,19 @@ public:
 			}
 			serverInfo->set( newInfoCache );
 		}
+
+		void clearStorageCache(uint16_t id) {
+			CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
+			auto& newInfo = newInfoCache.mutate();
+			for(auto it = newInfo.storageCaches.begin(); it != newInfo.storageCaches.end(); ++it) {
+				if(it->first == id) {
+					newInfo.id = deterministicRandom()->randomUniqueID();
+					newInfo.storageCaches.erase(it);
+					break;
+				}
+			}
+			serverInfo->set( newInfoCache );
+		}
 	};
 
 	struct UpdateWorkerList {
@@ -201,6 +242,11 @@ public:
 		return ( now() - startTime < 2 * FLOW_KNOBS->SERVER_REQUEST_INTERVAL ) || ( IFailureMonitor::failureMonitor().getState(worker.details.interf.storage.getEndpoint()).isAvailable() && ( !checkStable || worker.reboots < 2 ) );
 	}
 
+	bool isLongLivedStateless( Optional<Key> const& processId ) {
+		return (db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == processId) ||
+		       (db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == processId);
+	}
+
 	WorkerDetails getStorageWorker( RecruitStorageRequest const& req ) {
 		std::set<Optional<Standalone<StringRef>>> excludedMachines( req.excludeMachines.begin(), req.excludeMachines.end() );
 		std::set<Optional<Standalone<StringRef>>> includeDCs( req.includeDCs.begin(), req.includeDCs.end() );
@@ -453,8 +499,7 @@ public:
 				fitness = std::max(fitness, ProcessClass::ExcludeFit);
 			}
 			if( workerAvailable(it.second, checkStable) && fitness < unacceptableFitness && it.second.details.interf.locality.dcId()==dcId ) {
-				if ((db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == it.first) ||
-				    (db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == it.first)) {
+				if (isLongLivedStateless(it.first)) {
 					fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details);
 				} else {
 					fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].first.push_back(it.second.details);
@@ -486,8 +531,7 @@ public:
 			auto fitness = it.second.details.processClass.machineClassFitness( role );
 			if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && it.second.details.interf.locality.dcId() == dcId && ( !minWorker.present() || ( it.second.details.interf.id() != minWorker.get().worker.interf.id() && ( fitness < minWorker.get().fitness || (fitness == minWorker.get().fitness && id_used[it.first] <= minWorker.get().used ) ) ) ) ) {
-				if ((db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == it.first) ||
-				    (db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == it.first)) {
+				if (isLongLivedStateless(it.first)) {
 					fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details);
 				} else {
 					fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].first.push_back(it.second.details);
@@ -1328,6 +1372,7 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
 			dbInfo.clusterInterface = db->serverInfo->get().read().clusterInterface;
 			dbInfo.distributor = db->serverInfo->get().read().distributor;
 			dbInfo.ratekeeper = db->serverInfo->get().read().ratekeeper;
+			dbInfo.storageCaches = db->serverInfo->get().read().storageCaches;
 
 			TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id);
 			db->serverInfo->set( cachedInfo );
@@ -1580,8 +1625,27 @@ ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass
 		}
 		when( wait( failed ) ) {  // remove workers that have failed
 			WorkerInfo& failedWorkerInfo = cluster->id_worker[ worker.locality.processId() ];
+			if(failedWorkerInfo.storageCacheInfo.present()) {
+				bool found = false;
+				for(auto& it : cluster->id_worker) {
+					if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) {
+						found = true;
+						it.second.storageCacheInfo = failedWorkerInfo.storageCacheInfo;
+						cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional<StorageServerInterface>(), it.first);
+						if(!it.second.reply.isSet()) {
+							it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, failedWorkerInfo.storageCacheInfo) );
+						}
+						break;
+					}
+				}
+				if(!found) {
+					cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional<StorageServerInterface>(), Optional<Key>());
+				}
+				cluster->db.clearStorageCache(failedWorkerInfo.storageCacheInfo.get());
+			}
+
 			if (!failedWorkerInfo.reply.isSet()) {
-				failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo) );
+				failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo, Optional<uint16_t>()) );
 			}
 			if (worker.locality.processId() == cluster->masterProcessId) {
 				cluster->masterProcessId = Optional<Key>();
@@ -1855,7 +1919,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
 		if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) {
 			it.second.priorityInfo.isExcluded = isExcludedFromConfig;
 			if( !it.second.reply.isSet() ) {
-				it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
+				it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
 			}
 		}
 	}
@@ -1957,11 +2021,6 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
 		if ( self->gotFullyRecoveredConfig ) {
 			newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.address());
 		}
-
-		// Notify the worker to register again with new process class/exclusive property
-		if ( !req.reply.isSet() && newPriorityInfo != req.priorityInfo ) {
-			req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo) );
-		}
 	}
 
 	if( info == self->id_worker.end() ) {
@@ -2021,6 +2080,57 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
 			}
 		}
 	}
+	Optional<uint16_t> newStorageCache = req.storageCacheInterf.present() ? req.storageCacheInterf.get().first : Optional<uint16_t>();
+	auto& cacheInfo = self->id_worker[w.locality.processId()].storageCacheInfo;
+	if (req.storageCacheInterf.present()) {
+		auto it = self->db.cacheInterfaces.find(req.storageCacheInterf.get().first);
+		if(it == self->db.cacheInterfaces.end()) {
+			if(self->db.cachePopulated) {
+				if(cacheInfo.present()) {
+					self->db.clearStorageCache(cacheInfo.get());
+				}
+				newStorageCache = Optional<uint16_t>();
+				cacheInfo = Optional<uint16_t>();
+			} else {
+				self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second);
+				self->db.cacheInterfaces[req.storageCacheInterf.get().first] = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId());
+				cacheInfo = req.storageCacheInterf.get().first;
+			}
+		} else {
+			if(!it->second.second.present() || (cacheInfo.present() && cacheInfo.get() == it->first) ) {
+				self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second);
+				it->second = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId());
+				cacheInfo = req.storageCacheInterf.get().first;
+			}
+			else {
+				if(cacheInfo.present()) {
+					self->db.clearStorageCache(cacheInfo.get());
+				}
+				newStorageCache = Optional<uint16_t>();
+				cacheInfo = Optional<uint16_t>();
+			}
+		}
+	} else {
+		newStorageCache = cacheInfo;
+	}
+
+	if(self->gotProcessClasses && newProcessClass == ProcessClass::StorageCacheClass && !newStorageCache.present()) {
+		for(auto& it : self->db.cacheInterfaces) {
+			if(!it.second.second.present()) {
+				it.second.second = w.locality.processId();
+				self->id_worker[w.locality.processId()].storageCacheInfo = it.first;
+				newStorageCache = it.first;
+				break;
+			}
+		}
+	}
+
+	// Notify the worker to register again with new process class/exclusive property
+	if ( !req.reply.isSet() && ( newPriorityInfo != req.priorityInfo ||
+	     newStorageCache.present() != req.storageCacheInterf.present() ||
+	     (newStorageCache.present() && newStorageCache.get() != req.storageCacheInterf.get().first) ) ) {
+		req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo, newStorageCache) );
+	}
 }
 
 #define TIME_KEEPER_VERSION LiteralStringRef("1")
@@ -2240,7 +2350,7 @@ ACTOR Future<Void> monitorProcessClasses(ClusterControllerData *self) {
 					w.second.details.processClass = newProcessClass;
 					w.second.priorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController);
 					if (!w.second.reply.isSet()) {
-						w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo) );
+						w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo, w.second.storageCacheInfo) );
 					}
 				}
 			}
@@ -2300,6 +2410,80 @@ ACTOR Future<Void> monitorServerInfoConfig(ClusterControllerData::DBInfo* db) {
 	}
 }
 
+ACTOR Future<Void> monitorStorageCache(ClusterControllerData* self) {
+	loop {
+		state ReadYourWritesTransaction tr(self->db.db);
+		loop {
+			try {
+				tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
+				tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+				tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
+
+				Optional<Value> changeVal = wait(tr.get(cacheChangeKey));
+				Standalone<RangeResultRef> changeKeys = wait(tr.getRange(cacheChangeKeys, CLIENT_KNOBS->TOO_MANY));
+				ASSERT( !changeKeys.more && changeKeys.size() < CLIENT_KNOBS->TOO_MANY );
+
+				std::set<uint16_t> changeIDs;
+				for(auto& it : changeKeys) {
+					changeIDs.insert(cacheChangeKeyDecodeIndex(it.key));
+				}
+
+				for(auto& it : changeIDs) {
+					if(!self->db.cacheInterfaces.count(it)) {
+						self->db.cacheInterfaces[it] = std::make_pair(Optional<StorageServerInterface>(), Optional<Key>());
+					}
+				}
+
+				std::vector<uint16_t> removeIDs;
+				for(auto& it : self->db.cacheInterfaces) {
+					if(!changeIDs.count(it.first)) {
+						removeIDs.push_back(it.first);
+						if(it.second.second.present()) {
+							self->id_worker[it.second.second.get()].storageCacheInfo = Optional<uint16_t>();
+						}
+						self->db.clearStorageCache(it.first);
+					}
+				}
+
+				for(auto& it : removeIDs) {
+					self->db.cacheInterfaces.erase(it);
+				}
+
+				for(auto& c : self->db.cacheInterfaces) {
+					if(!c.second.second.present()) {
+						bool found = false;
+						for(auto& it : self->id_worker) {
+							if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) {
+								found = true;
+								it.second.storageCacheInfo = c.first;
+								c.second.second = it.first;
+								if(!it.second.reply.isSet()) {
+									it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, c.first) );
+								}
+								break;
+							}
+						}
+						if(!found) {
+							break;
+						}
+					}
+				}
+
+				state Future<Void> configChangeFuture = tr.watch(cacheChangeKey);
+
+				self->db.cachePopulated = true;
+				wait(tr.commit());
+				wait(configChangeFuture);
+
+				break;
+			}
+			catch (Error &e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+}
+
 ACTOR Future<Void> monitorClientTxnInfoConfigs(ClusterControllerData::DBInfo* db) {
 	loop {
 		state ReadYourWritesTransaction tr(db->db);
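monitorStorageCache follows FDB's standard watch loop: read the current state, install a watch on cacheChangeKey, commit (which is what registers the watch), then block until the key changes and start over. The skeleton of that pattern, reduced to its control flow (a sketch, not patch code):

// Sketch: the canonical FDB watch loop.
ACTOR Future<Void> watchKeySkeleton(Database db, Key key) {
	loop {
		state ReadYourWritesTransaction tr(db);
		loop {
			try {
				tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); // needed for \xff keys
				state Future<Void> change = tr.watch(key); // fires after commit, when key mutates
				wait(tr.commit());                         // commit registers the watch
				wait(change);                              // block until the key changes
				break;                                     // re-read state in the outer loop
			} catch (Error& e) {
				wait(tr.onError(e));                       // standard retry on conflicts/timeouts
			}
		}
	}
}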
@@ -2350,7 +2534,7 @@ ACTOR Future<Void> updatedChangingDatacenters(ClusterControllerData *self) {
 				if ( worker.priorityInfo.dcFitness > newFitness ) {
 					worker.priorityInfo.dcFitness = newFitness;
 					if(!worker.reply.isSet()) {
-						worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) );
+						worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) );
 					}
 				} else {
 					state int currentFit = ProcessClass::BestFit;
@@ -2363,7 +2547,7 @@ ACTOR Future<Void> updatedChangingDatacenters(ClusterControllerData *self) {
 							updated = true;
 							it.second.priorityInfo.dcFitness = fitness;
 							if(!it.second.reply.isSet()) {
-								it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
+								it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
 							}
 						}
 					}
@@ -2402,7 +2586,7 @@ ACTOR Future<Void> updatedChangedDatacenters(ClusterControllerData *self) {
 					if( worker.priorityInfo.dcFitness != newFitness ) {
 						worker.priorityInfo.dcFitness = newFitness;
 						if(!worker.reply.isSet()) {
-							worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) );
+							worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) );
 						}
 					}
 				} else {
@@ -2416,7 +2600,7 @@ ACTOR Future<Void> updatedChangedDatacenters(ClusterControllerData *self) {
 							updated = true;
 							it.second.priorityInfo.dcFitness = fitness;
 							if(!it.second.reply.isSet()) {
-								it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
+								it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
 							}
 						}
 					}
@@ -2703,8 +2887,8 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
 	self.addActor.send( handleForcedRecoveries(&self, interf) );
 	self.addActor.send( monitorDataDistributor(&self) );
 	self.addActor.send( monitorRatekeeper(&self) );
+	self.addActor.send( monitorStorageCache(&self) );
 	self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") );
-	//printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
 
 	loop choose {
diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h
index d8432c7d1e..cf238f1b79 100644
--- a/fdbserver/ClusterRecruitmentInterface.h
+++ b/fdbserver/ClusterRecruitmentInterface.h
@@ -175,13 +175,14 @@ struct RegisterWorkerReply {
 	constexpr static FileIdentifier file_identifier = 16475696;
 	ProcessClass processClass;
 	ClusterControllerPriorityInfo priorityInfo;
+	Optional<uint16_t> storageCache;
 
 	RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
-	RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo) : processClass(processClass), priorityInfo(priorityInfo) {}
+	RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional<uint16_t> storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {}
 
 	template <class Ar>
 	void serialize( Ar& ar ) {
-		serializer(ar, processClass, priorityInfo);
+		serializer(ar, processClass, priorityInfo, storageCache);
 	}
 };
 
@@ -194,16 +195,17 @@ struct RegisterWorkerRequest {
 	Generation generation;
 	Optional<DataDistributorInterface> distributorInterf;
 	Optional<RatekeeperInterface> ratekeeperInterf;
+	Optional<std::pair<uint16_t, StorageServerInterface>> storageCacheInterf;
 	ReplyPromise<RegisterWorkerReply> reply;
 	bool degraded;
 
 	RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {}
-	RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, bool degraded) :
-		wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), degraded(degraded) {}
+	RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, Optional<std::pair<uint16_t, StorageServerInterface>> storageCacheInterf, bool degraded) :
+		wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {}
 
 	template <class Ar>
 	void serialize( Ar& ar ) {
-		serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, reply, degraded);
+		serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, reply, degraded);
 	}
 };
 
diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
index 4a71fbb5e2..a65493e4f3 100644
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ -4986,7 +4986,6 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") {
 	state int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize;
 	state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
 	state int teamSize = 3;
-	state int targetTeamsPerServer = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (teamSize + 1) / 2;
 	state DDTeamCollection* collection = testTeamCollection(teamSize, policy, processSize);
 
 	collection->addTeam(std::set<UID>({ UID(1, 0), UID(2, 0), UID(3, 0) }), true);
diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp
index 55e954ddbf..d4d3645cdc 100644
--- a/fdbserver/DataDistributionTracker.actor.cpp
+++ b/fdbserver/DataDistributionTracker.actor.cpp
@@ -443,6 +443,7 @@ Future<Void> shardMerger(
 	bool forwardComplete = false;
 	KeyRangeRef merged;
 	StorageMetrics endingStats = shardSize->get().get();
+	int64_t systemBytes = keys.begin >= systemKeys.begin ? shardSize->get().get().bytes : 0;
 
 	loop {
 		Optional<StorageMetrics> newMetrics;
@@ -480,6 +481,9 @@ Future<Void> shardMerger(
 
 		merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end );
 		endingStats += newMetrics.get();
+		if((forwardComplete ? prevIter->range().begin : nextIter->range().begin) >= systemKeys.begin) {
+			systemBytes += newMetrics.get().bytes;
+		}
 		shardsMerged++;
 
 		auto shardBounds = getShardSizeBounds( merged, maxShardSize );
@@ -498,6 +502,9 @@ Future<Void> shardMerger(
 
 			// If going forward, remove most recently added range
 			endingStats -= newMetrics.get();
+			if(nextIter->range().begin >= systemKeys.begin) {
+				systemBytes -= newMetrics.get().bytes;
+			}
 			shardsMerged--;
 			--nextIter;
 			merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end );
@@ -514,6 +521,9 @@ Future<Void> shardMerger(
 		.detail("EndingSize", endingStats.bytes)
 		.detail("BatchedMerges", shardsMerged);
 
+	if(mergeRange.begin < systemKeys.begin) {
+		self->systemSizeEstimate -= systemBytes;
+	}
 	restartShardTrackers( self, mergeRange, endingStats );
 	self->shardsAffectedByTeamFailure->defineShard( mergeRange );
 	self->output.send( RelocateShard( mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD ) );
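The commonPrefixLength helper added in the DeltaTree.h diff just below compares one machine word at a time and, on the first differing word, locates the mismatching byte with count-trailing-zeros on the XOR, which is why the ctzll(a ^ b) / 8 step assumes little-endian loads. A worked example (standalone, not patch code):

#include <cstdint>
#include <cstring>
#include <cassert>

int main() {
	// "orange1" vs "orange2": bytes 0..5 match, byte 6 differs ('1' vs '2').
	const char a[8] = "orange1", b[8] = "orange2";
	uint64_t wa, wb;
	memcpy(&wa, a, 8);
	memcpy(&wb, b, 8);
	// The lowest set bit of the XOR lies in byte 6 (bits 48..55 on little-endian),
	// so ctz/8 recovers the common prefix length in one step.
	assert(__builtin_ctzll(wa ^ wb) / 8 == 6);
}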
return cl; +} + +static int commonPrefixLength(StringRef a, StringRef b) { + return commonPrefixLength(a.begin(), b.begin(), std::min(a.size(), b.size())); +} + +// This appears to be the fastest version +static int lessOrEqualPowerOfTwo(int n) { + int p; + for (p = 1; p+p <= n; p+=p); + return p; +} + +/* +static int _lessOrEqualPowerOfTwo(uint32_t n) { + if(n == 0) + return n; + int trailing = __builtin_ctz(n); + int leading = __builtin_clz(n); + if(trailing + leading == ((sizeof(n) * 8) - 1)) + return n; + return 1 << ( (sizeof(n) * 8) - leading - 1); +} + +static int __lessOrEqualPowerOfTwo(unsigned int n) { + int p = 1; + for(; p <= n; p <<= 1); + return p >> 1; +} +*/ + +static int perfectSubtreeSplitPoint(int subtree_size) { + // return the inorder index of the root node in a subtree of the given size + // consistent with the resulting binary search tree being "perfect" (having minimal height + // and all missing nodes as far right as possible). + // There has to be a simpler way to do this. + int s = lessOrEqualPowerOfTwo((subtree_size - 1) / 2 + 1) - 1; + return std::min(s * 2 + 1, subtree_size - s - 1); +} + +static int perfectSubtreeSplitPointCached(int subtree_size) { + static uint16_t *points = nullptr; + static const int max = 500; + if(points == nullptr) { + points = new uint16_t[max]; + for(int i = 0; i < max; ++i) + points[i] = perfectSubtreeSplitPoint(i); + } + + if(subtree_size < max) + return points[subtree_size]; + return perfectSubtreeSplitPoint(subtree_size); +} + // Delta Tree is a memory mappable binary tree of T objects such that each node's item is // stored as a Delta which can reproduce the node's T item given the node's greatest // lesser ancestor and the node's least greater ancestor. @@ -209,7 +285,7 @@ public: } }; - // Cursor provides a way to seek into a PrefixTree and iterate over its contents + // Cursor provides a way to seek into a DeltaTree and iterate over its contents // All Cursors from a Reader share the same decoded node 'cache' (tree of DecodedNodes) struct Cursor { Cursor() : reader(nullptr), node(nullptr) { @@ -342,7 +418,7 @@ public: // The boundary leading to the new page acts as the last time we branched right if(begin != end) { - nodeBytes = build(root(), begin, end, prev, next); + nodeBytes = build(root(), begin, end, prev, next, prev->getCommonPrefixLen(*next, 0)); } else { nodeBytes = 0; @@ -351,7 +427,7 @@ public: } private: - static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next) { + static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next, int subtreeCommon) { //printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str()); //printf("build: root at %p sizeof(Node) %d delta at %p \n", &root, sizeof(Node), &root.delta()); ASSERT(end != begin); @@ -361,12 +437,8 @@ private: int mid = perfectSubtreeSplitPointCached(count); const T &item = begin[mid]; - // Get the common prefix length between next and prev - // Since mid is between them, we can skip that length to determine the common prefix length - // between mid and prev and between mid and next. 
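	// (Review note: the lines removed below recomputed the prev/next common prefix at
	//  every node. The new build() signature instead threads it down the recursion as
	//  'subtreeCommon', so commonWithPrev and commonWithNext seed the left and right
	//  subtree calls respectively and the work is done once per subtree, not per node.)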
- int nextPrevCommon = prev->getCommonPrefixLen(*next, 0); - int commonWithPrev = item.getCommonPrefixLen(*prev, nextPrevCommon); - int commonWithNext = item.getCommonPrefixLen(*next, nextPrevCommon); + int commonWithPrev = item.getCommonPrefixLen(*prev, subtreeCommon); + int commonWithNext = item.getCommonPrefixLen(*next, subtreeCommon); bool prefixSourcePrev; int commonPrefix; @@ -391,7 +463,7 @@ private: // Serialize left child if(count > 1) { - wptr += build(*(Node *)wptr, begin, begin + mid, prev, &item); + wptr += build(*(Node *)wptr, begin, begin + mid, prev, &item, commonWithPrev); root.leftChildOffset = deltaSize; } else { @@ -401,7 +473,7 @@ private: // Serialize right child if(count > 2) { root.rightChildOffset = wptr - (uint8_t *)&root.delta(); - wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next); + wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next, commonWithNext); } else { root.rightChildOffset = 0; diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index 3daa798036..d435320989 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -142,7 +142,7 @@ ACTOR Future spawnProcess(std::string binPath, std::vector par #endif ACTOR Future execHelper(ExecCmdValueString* execArg, UID snapUID, std::string folder, std::string role) { - state Standalone uidStr = snapUID.toString(); + state Standalone uidStr(snapUID.toString()); state int err = 0; state Future cmdErr; state double maxWaitTime = SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT; diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 5823588a17..8f79d9c57f 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -29,25 +29,33 @@ #define REDWOOD_DEBUG 0 -#define debug_printf_always(...) { fprintf(stdout, "%s %f ", g_network->getLocalAddress().toString().c_str(), now()), fprintf(stdout, __VA_ARGS__); fflush(stdout); } +#define debug_printf_stream stderr +#define debug_printf_always(...) { fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); fprintf(debug_printf_stream, __VA_ARGS__); fflush(debug_printf_stream); } #define debug_printf_noop(...) -#if REDWOOD_DEBUG - #define debug_printf debug_printf_always +#if defined(NO_INTELLISENSE) + #if REDWOOD_DEBUG + #define debug_printf debug_printf_always + #else + #define debug_printf debug_printf_noop + #endif #else -#define debug_printf debug_printf_noop + // To get error-checking on debug_printf statements in IDE + #define debug_printf printf #endif -#define BEACON fprintf(stderr, "%s: %s line %d \n", __FUNCTION__, __FILE__, __LINE__) +#define BEACON debug_printf_always("HERE\n") +#define TRACE debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str()); #ifndef VALGRIND #define VALGRIND_MAKE_MEM_UNDEFINED(x, y) #define VALGRIND_MAKE_MEM_DEFINED(x, y) #endif -typedef uint32_t LogicalPageID; // uint64_t? 
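	// (Review note: the replacement below keeps LogicalPageID at 32 bits, introduces a
	//  separate PhysicalPageID typedef for the new pager, and redefines the invalid-page
	//  sentinel as the unsigned maximum value rather than a signed -1 cast.)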
-static const int invalidLogicalPageID = LogicalPageID(-1); +typedef uint32_t LogicalPageID; +typedef uint32_t PhysicalPageID; +#define invalidLogicalPageID std::numeric_limits::max() class IPage { public: @@ -78,72 +86,96 @@ public: class IPagerSnapshot { public: - virtual Future> getPhysicalPage(LogicalPageID pageID) = 0; + virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool nohit) = 0; virtual Version getVersion() const = 0; + virtual Key getMetaKey() const = 0; + virtual ~IPagerSnapshot() {} virtual void addref() = 0; virtual void delref() = 0; }; -class IPager : public IClosable { +// This API is probably customized to the behavior of DWALPager and probably needs some changes to be more generic. +class IPager2 : public IClosable { public: // Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed. virtual Reference newPageBuffer() = 0; // Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead). // For a given pager instance, separate calls to this function must return the same value. + // Only valid to call after recovery is complete. virtual int getUsablePageSize() = 0; - - virtual StorageBytes getStorageBytes() = 0; - // Permitted to fail (ASSERT) during recovery. - virtual Reference getReadSnapshot(Version version) = 0; + // Allocate a new page ID for a subsequent write. The page will be considered in-use after the next commit + // regardless of whether or not it was written to. + virtual Future newPageID() = 0; - // Returns an unused LogicalPageID. - // LogicalPageIDs in the range [0, SERVER_KNOBS->PAGER_RESERVED_PAGES) do not need to be allocated. - // Permitted to fail (ASSERT) during recovery. - virtual LogicalPageID allocateLogicalPage() = 0; + // Replace the contents of a page with new data across *all* versions. + // Existing holders of a page reference for pageID, read from any version, + // may see the effects of this write. + virtual void updatePage(LogicalPageID pageID, Reference data) = 0; - // Signals that the page will no longer be used as of the specified version. Versions prior to the specified version must be kept. - // Permitted to fail (ASSERT) during recovery. - virtual void freeLogicalPage(LogicalPageID pageID, Version version) = 0; + // Try to atomically update the contents of a page as of version v in the next commit. + // If the pager is unable to do this at this time, it may choose to write the data to a new page ID + // instead and return the new page ID to the caller. Otherwise the original pageID argument will be returned. + // If a new page ID is returned, the old page ID will be freed as of version v + virtual Future atomicUpdatePage(LogicalPageID pageID, Reference data, Version v) = 0; - // Writes a page with the given LogicalPageID at the specified version. LogicalPageIDs in the range [0, SERVER_KNOBS->PAGER_RESERVED_PAGES) - // can be written without being allocated. All other LogicalPageIDs must be allocated using allocateLogicalPage before writing them. - // - // If updateVersion is 0, we are signalling to the pager that we are reusing the LogicalPageID entry at the current latest version of pageID. - // - // Otherwise, we will add a new entry for LogicalPageID at the specified version. In that case, updateVersion must be larger than any version - // written to this page previously, and it must be larger than any version committed. 
If referencePageID is given, the latest version of that
-	// page will be used for the write, which *can* be less than the latest committed version.
-	//
-	// Permitted to fail (ASSERT) during recovery.
-	virtual void writePage(LogicalPageID pageID, Reference<IPage> contents, Version updateVersion, LogicalPageID referencePageID = invalidLogicalPageID) = 0;
+	// Free pageID to be used again after the commit that moves oldestVersion past v
+	virtual void freePage(LogicalPageID pageID, Version v) = 0;
-	// Signals to the pager that no more reads will be performed in the range [begin, end).
-	// Permitted to fail (ASSERT) during recovery.
-	virtual void forgetVersions(Version begin, Version end) = 0;
+	// Returns the latest data (regardless of version) for a page by LogicalPageID
+	// The data returned will be the later of
+	//   - the most recent committed atomic update
+	//   - the most recent non-atomic write
+	// Cacheable indicates that the page should be added to the page cache (if applicable?) as a result of this read.
+	// NoHit indicates that the read should not be considered a cache hit, such as when preloading pages that are
+	// considered likely to be needed soon.
+	virtual Future<Reference<IPage>> readPage(LogicalPageID pageID, bool cacheable = true, bool noHit = false) = 0;
-	// Makes durable all writes and any data structures used for recovery.
-	// Permitted to fail (ASSERT) during recovery.
+	// Get a snapshot of the metakey and all pages as of the version v which must be >= getOldestVersion()
+	// Note that snapshots at any version may still see the results of updatePage() calls.
+	// The snapshot shall be usable until setOldestVersion() is called with a version > v.
+	virtual Reference<IPagerSnapshot> getReadSnapshot(Version v) = 0;
+
+	// Atomically make durable all pending page writes, page frees, and update the metadata string.
 	virtual Future<Void> commit() = 0;
-	// Returns the latest version of the pager. Permitted to block until recovery is complete, at which point it should always be set immediately.
-	// Some functions in the IPager interface are permitted to fail (ASSERT) during recovery, so users should wait for getLatestVersion to complete
-	// before doing anything else.
-	virtual Future<Version> getLatestVersion() = 0;
+	// Get the latest meta key set or committed
+	virtual Key getMetaKey() const = 0;
-	// Sets the latest version of the pager. Must be monotonically increasing.
-	//
-	// Must be called prior to reading the specified version. SOMEDAY: It may be desirable in the future to relax this constraint for performance reasons.
-	//
-	// Permitted to fail (ASSERT) during recovery.
-	virtual void setLatestVersion(Version version) = 0;
+	// Set the metakey which will be stored in the next commit
+	virtual void setMetaKey(KeyRef metaKey) = 0;
+
+	// Sets the next commit version
+	virtual void setCommitVersion(Version v) = 0;
+
+	virtual StorageBytes getStorageBytes() = 0;
+
+	// Count of pages in use by the pager client
+	virtual Future<int64_t> getUserPageCount() = 0;
+
+	// Future returned is ready when pager has been initialized from disk and is ready for reads and writes.
+	// It is invalid to call most other functions until init() is ready.
+	// TODO: Document further.
+	virtual Future<Void> init() = 0;
+
+	// Returns latest committed version
+	virtual Version getLatestVersion() = 0;
+
+	// Returns the oldest readable version as of the most recent committed version
+	virtual Version getOldestVersion() = 0;
+
+	// Sets the oldest readable version to be put into effect at the next commit.
+ // The pager can reuse pages that were freed at a version less than v. + // If any snapshots are in use at a version less than v, the pager can either forcefully + // invalidate them or keep their versions around until the snapshots are no longer in use. + virtual void setOldestVersion(Version v) = 0; protected: - ~IPager() {} // Destruction should be done using close()/dispose() from the IClosable interface + ~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface }; #endif diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h index dd7b0f4bea..9baf5c4469 100644 --- a/fdbserver/IVersionedStore.h +++ b/fdbserver/IVersionedStore.h @@ -30,22 +30,17 @@ class IStoreCursor { public: virtual Future findEqual(KeyRef key) = 0; - virtual Future findFirstEqualOrGreater(KeyRef key, bool needValue, int prefetchNextBytes) = 0; - virtual Future findLastLessOrEqual(KeyRef key, bool needValue, int prefetchPriorBytes) = 0; - virtual Future next(bool needValue) = 0; - virtual Future prev(bool needValue) = 0; + virtual Future findFirstEqualOrGreater(KeyRef key, int prefetchBytes = 0) = 0; + virtual Future findLastLessOrEqual(KeyRef key, int prefetchBytes = 0) = 0; + virtual Future next() = 0; + virtual Future prev() = 0; virtual bool isValid() = 0; virtual KeyRef getKey() = 0; - //virtual StringRef getCompressedKey() = 0; virtual ValueRef getValue() = 0; - virtual void invalidateReturnedStrings() = 0; - virtual void addref() = 0; virtual void delref() = 0; - - virtual std::string toString() const = 0; }; class IVersionedStore : public IClosable { @@ -61,10 +56,12 @@ public: virtual void clear(KeyRangeRef range) = 0; virtual void mutate(int op, StringRef param1, StringRef param2) = 0; virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing - virtual void forgetVersions(Version begin, Version end) = 0; // Versions [begin, end) no longer readable + virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit + virtual Version getOldestVersion() = 0; // Get oldest readable version virtual Future commit() = 0; - virtual Future getLatestVersion() = 0; + virtual Future init() = 0; + virtual Version getLatestVersion() = 0; // readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed // to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations. diff --git a/fdbserver/IndirectShadowPager.actor.cpp b/fdbserver/IndirectShadowPager.actor.cpp deleted file mode 100644 index 7a6457a3f8..0000000000 --- a/fdbserver/IndirectShadowPager.actor.cpp +++ /dev/null @@ -1,960 +0,0 @@ -/* - * IndirectShadowPager.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "fdbserver/IndirectShadowPager.h" -#include "fdbserver/Knobs.h" - -#include "flow/UnitTest.h" -#include "flow/actorcompiler.h" -#include "fdbrpc/crc32c.h" - -struct SumType { - bool operator==(const SumType &rhs) const { return crc == rhs.crc; } - uint32_t crc; - std::string toString() { return format("0x%08x", crc); } -}; - -bool checksum(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical, bool write) { - // Calculates and then stores or verifies the checksum at the end of the page. - // If write is true then the checksum is written into the page - // If write is false then the checksum is compared to the in-page sum and - // and error will be thrown if they do not match. - ASSERT(sizeof(SumType) == IndirectShadowPage::PAGE_OVERHEAD_BYTES); - // Adjust pageSize to refer to only usable storage bytes - pageSize -= IndirectShadowPage::PAGE_OVERHEAD_BYTES; - SumType sum; - SumType *pSumInPage = (SumType *)(page + pageSize); - // Write sum directly to page or to sum variable based on mode - SumType *sumOut = write ? pSumInPage : ∑ - sumOut->crc = crc32c_append(logical, page, pageSize); - VALGRIND_MAKE_MEM_DEFINED(sumOut, sizeof(SumType)); - - debug_printf("checksum %s%s logical %d physical %d size %d checksums page %s calculated %s data at %p %s\n", - write ? "write" : "read", - (!write && sum != *pSumInPage) ? " MISMATCH" : "", - logical, physical, pageSize, - write ? "NA" : pSumInPage->toString().c_str(), - sumOut->toString().c_str(), page, ""); - - // Verify if not in write mode - if(!write && sum != *pSumInPage) { - TraceEvent (SevError, "IndirectShadowPagerPageChecksumFailure") - .detail("UserPageSize", pageSize) - .detail("Filename", file->getFilename()) - .detail("LogicalPage", logical) - .detail("PhysicalPage", physical) - .detail("ChecksumInPage", pSumInPage->toString()) - .detail("ChecksumCalculated", sum.toString()); - return false; - } - return true; -} - -inline bool checksumRead(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical) { - return checksum(file, page, pageSize, logical, physical, false); -} - -inline void checksumWrite(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical) { - checksum(file, page, pageSize, logical, physical, true); -} - -IndirectShadowPage::IndirectShadowPage() : fastAllocated(true) { - data = (uint8_t*)FastAllocator<4096>::allocate(); -} - -IndirectShadowPage::~IndirectShadowPage() { - if(fastAllocated) { - FastAllocator<4096>::release(data); - } - else if(file) { - file->releaseZeroCopy(data, PAGE_BYTES, (int64_t) physicalPageID * PAGE_BYTES); - } -} - -uint8_t const* IndirectShadowPage::begin() const { - return data; -} - -uint8_t* IndirectShadowPage::mutate() { - return data; -} - -int IndirectShadowPage::size() const { - return PAGE_BYTES - PAGE_OVERHEAD_BYTES; -} - -const int IndirectShadowPage::PAGE_BYTES = 4096; -const int IndirectShadowPage::PAGE_OVERHEAD_BYTES = sizeof(SumType); - -IndirectShadowPagerSnapshot::IndirectShadowPagerSnapshot(IndirectShadowPager *pager, Version version) - : pager(pager), version(version), pagerError(pager->getError()) -{ -} - -Future> IndirectShadowPagerSnapshot::getPhysicalPage(LogicalPageID pageID) { - if(pagerError.isReady()) - pagerError.get(); - return pager->getPage(Reference::addRef(this), pageID, version); -} - -template -T bigEndian(T val) { - static_assert(sizeof(T) <= 8, "Can't compute bigEndian on integers larger than 8 bytes"); - uint64_t b = bigEndian64(val); 
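	// (Review note on the deleted helper: on a little-endian host, byte-swapping the
	//  64-bit promotion of val moves its significant bytes to the high-address end of
	//  b, so the return below reads the big-endian encoding of val out of the last
	//  sizeof(T) bytes of b.)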
- return *(T*)((uint8_t*)&b+8-sizeof(T)); -} - -ACTOR Future recover(IndirectShadowPager *pager) { - try { - TraceEvent("PagerRecovering").detail("Filename", pager->pageFileName); - pager->pageTableLog = keyValueStoreMemory(pager->basename, UID(), 1e9, "pagerlog"); - - // TODO: this can be done synchronously with the log recovery - int64_t flags = IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK; - state bool exists = fileExists(pager->pageFileName); - if(!exists) { - flags |= IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE; - } - - Reference dataFile = wait(IAsyncFileSystem::filesystem()->open(pager->pageFileName, flags, 0600)); - pager->dataFile = dataFile; - - TraceEvent("PagerOpenedDataFile").detail("Filename", pager->pageFileName); - - if(!exists) { - wait(pager->dataFile->sync()); - } - TraceEvent("PagerSyncdDataFile").detail("Filename", pager->pageFileName); - - state int64_t fileSize = wait(pager->dataFile->size()); - TraceEvent("PagerGotFileSize").detail("Size", fileSize).detail("Filename", pager->pageFileName); - - if(fileSize > 0) { - TraceEvent("PagerRecoveringFromLogs").detail("Filename", pager->pageFileName); - Optional pagesAllocatedValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::PAGES_ALLOCATED_KEY)); - if(pagesAllocatedValue.present()) { - BinaryReader pr(pagesAllocatedValue.get(), Unversioned()); - uint32_t pagesAllocated; - pr >> pagesAllocated; - pager->pagerFile.init(fileSize, pagesAllocated); - - debug_printf("%s: Recovered pages allocated: %d\n", pager->pageFileName.c_str(), pager->pagerFile.pagesAllocated); - ASSERT(pager->pagerFile.pagesAllocated != PagerFile::INVALID_PAGE); - - Optional latestVersionValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::LATEST_VERSION_KEY)); - ASSERT(latestVersionValue.present()); - - BinaryReader vr(latestVersionValue.get(), Unversioned()); - vr >> pager->latestVersion; - - Optional oldestVersionValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::OLDEST_VERSION_KEY)); - - if(oldestVersionValue.present()) { - BinaryReader vr(oldestVersionValue.get(), Unversioned()); - vr >> pager->oldestVersion; - } - - debug_printf("%s: Recovered version info: earliest v%lld latest v%lld\n", pager->pageFileName.c_str(), pager->oldestVersion, pager->latestVersion); - pager->committedVersion = pager->latestVersion; - - Standalone> tableEntries = wait(pager->pageTableLog->readRange(KeyRangeRef(IndirectShadowPager::TABLE_ENTRY_PREFIX, strinc(IndirectShadowPager::TABLE_ENTRY_PREFIX)))); - - if(tableEntries.size() > 0) { - BinaryReader kr(tableEntries.back().key, Unversioned()); - - uint8_t prefix; - LogicalPageID logicalPageID; - - kr >> prefix; - ASSERT(prefix == IndirectShadowPager::TABLE_ENTRY_PREFIX.begin()[0]); - - kr >> logicalPageID; - logicalPageID = bigEndian(logicalPageID); - - LogicalPageID pageTableSize = std::max(logicalPageID+1, SERVER_KNOBS->PAGER_RESERVED_PAGES); - pager->pageTable.resize(pageTableSize); - debug_printf("%s: Recovered page table size: %d\n", pager->pageFileName.c_str(), pageTableSize); - } - else { - debug_printf("%s: Recovered no page table entries\n", pager->pageFileName.c_str()); - } - - LogicalPageID nextPageID = SERVER_KNOBS->PAGER_RESERVED_PAGES; - std::set allocatedPhysicalPages; - for(auto entry : tableEntries) { - BinaryReader kr(entry.key, Unversioned()); - BinaryReader vr(entry.value, Unversioned()); - - uint8_t prefix; - LogicalPageID logicalPageID; - Version version; - PhysicalPageID physicalPageID; - - kr >> prefix; - ASSERT(prefix == 
IndirectShadowPager::TABLE_ENTRY_PREFIX.begin()[0]); - - kr >> logicalPageID; - logicalPageID = bigEndian(logicalPageID); - - kr >> version; - version = bigEndian(version); - vr >> physicalPageID; - - ASSERT(version <= pager->latestVersion); - - pager->pageTable[logicalPageID].push_back(std::make_pair(version, physicalPageID)); - - if(physicalPageID != PagerFile::INVALID_PAGE) { - allocatedPhysicalPages.insert(physicalPageID); - pager->pagerFile.markPageAllocated(logicalPageID, version, physicalPageID); - } - - while(nextPageID < logicalPageID) { - pager->logicalFreeList.push_back(nextPageID++); - } - if(logicalPageID == nextPageID) { - ++nextPageID; - } - - debug_printf("%s: Recovered page table entry logical %d -> (v%lld, physical %d)\n", pager->pageFileName.c_str(), logicalPageID, version, physicalPageID); - } - - debug_printf("%s: Building physical free list\n", pager->pageFileName.c_str()); - // TODO: can we do this better? does it require storing extra info in the log? - PhysicalPageID nextPhysicalPageID = 0; - for(auto itr = allocatedPhysicalPages.begin(); itr != allocatedPhysicalPages.end(); ++itr) { - while(nextPhysicalPageID < *itr) { - pager->pagerFile.freePage(nextPhysicalPageID++); - } - ++nextPhysicalPageID; - } - - while(nextPhysicalPageID < pager->pagerFile.pagesAllocated) { - pager->pagerFile.freePage(nextPhysicalPageID++); - } - } - } - - if(pager->pageTable.size() < SERVER_KNOBS->PAGER_RESERVED_PAGES) { - pager->pageTable.resize(SERVER_KNOBS->PAGER_RESERVED_PAGES); - } - - pager->pagerFile.finishedMarkingPages(); - pager->pagerFile.startVacuuming(); - - debug_printf("%s: Finished recovery at v%lld\n", pager->pageFileName.c_str(), pager->latestVersion); - TraceEvent("PagerFinishedRecovery").detail("LatestVersion", pager->latestVersion).detail("OldestVersion", pager->oldestVersion).detail("Filename", pager->pageFileName); - } - catch(Error &e) { - if(e.code() != error_code_actor_cancelled) { - TraceEvent(SevError, "PagerRecoveryFailed").error(e, true).detail("Filename", pager->pageFileName); - } - throw; - } - - return Void(); -} - -ACTOR Future housekeeper(IndirectShadowPager *pager) { - wait(pager->recovery); - wait(Never()); - loop { - state LogicalPageID pageID = 0; - for(; pageID < pager->pageTable.size(); ++pageID) { - // TODO: pick an appropriate rate for this loop and determine the right way to implement it - // Right now, this delays 10ms every 400K pages, which means we have 1s of delay for every - // 40M pages. In total, we introduce 100s delay for a max size 4B page file. 
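	// (Review note: the deleted loop below uses the usual Flow pacing idiom: a real
	//  delay() every N iterations to cap CPU use, and a plain yield() otherwise so
	//  other actors on the run loop are not starved between iterations.)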
- if(pageID % 400000 == 0) { - wait(delay(0.01)); - } - else { - wait(yield()); - } - - auto& pageVersionMap = pager->pageTable[pageID]; - - if(pageVersionMap.size() > 0) { - auto itr = pageVersionMap.begin(); - for(auto prev = itr; prev != pageVersionMap.end() && prev->first < pager->oldestVersion; prev=itr) { - pager->pagerFile.markPageAllocated(pageID, itr->first, itr->second); - ++itr; - if(prev->second != PagerFile::INVALID_PAGE && (itr == pageVersionMap.end() || itr->first <= pager->oldestVersion)) { - pager->freePhysicalPageID(prev->second); - } - if(itr == pageVersionMap.end() || itr->first >= pager->oldestVersion) { - debug_printf("%s: Updating oldest version for logical %u: v%lld\n", pager->pageFileName.c_str(), pageID, pager->oldestVersion); - pager->logPageTableClear(pageID, 0, pager->oldestVersion); - - if(itr != pageVersionMap.end() && itr->first > pager->oldestVersion) { - debug_printf("%s: Erasing pages to prev from pageVersionMap for %d (itr=%lld, prev=%lld)\n", pager->pageFileName.c_str(), pageID, itr->first, prev->first); - prev->first = pager->oldestVersion; - pager->logPageTableUpdate(pageID, pager->oldestVersion, prev->second); - itr = pageVersionMap.erase(pageVersionMap.begin(), prev); - } - else { - debug_printf("%s: Erasing pages to itr from pageVersionMap for %d (%d) (itr=%lld, prev=%lld)\n", pager->pageFileName.c_str(), pageID, itr == pageVersionMap.end(), itr==pageVersionMap.end() ? -1 : itr->first, prev->first); - itr = pageVersionMap.erase(pageVersionMap.begin(), itr); - } - } - } - - for(; itr != pageVersionMap.end(); ++itr) { - pager->pagerFile.markPageAllocated(pageID, itr->first, itr->second); - } - - if(pageVersionMap.size() == 0) { - pager->freeLogicalPageID(pageID); - } - } - } - - pager->pagerFile.finishedMarkingPages(); - } -} - -ACTOR Future forwardError(Future f, Promise target) { - try { - wait(f); - } - catch(Error &e) { - if(e.code() != error_code_actor_cancelled && target.canBeSet()) { - target.sendError(e); - } - - throw e; - } - - return Void(); -} - -IndirectShadowPager::IndirectShadowPager(std::string basename) - : basename(basename), latestVersion(0), committedVersion(0), committing(Void()), oldestVersion(0), pagerFile(this) -{ - pageFileName = basename; - recovery = forwardError(recover(this), errorPromise); - housekeeping = forwardError(housekeeper(this), errorPromise); -} - -StorageBytes IndirectShadowPager::getStorageBytes() { - int64_t free; - int64_t total; - g_network->getDiskBytes(parentDirectory(basename), free, total); - return StorageBytes(free, total, pagerFile.size(), free + IndirectShadowPage::PAGE_BYTES * pagerFile.getFreePages()); -} - -Reference IndirectShadowPager::newPageBuffer() { - return Reference(new IndirectShadowPage()); -} - -int IndirectShadowPager::getUsablePageSize() { - return IndirectShadowPage::PAGE_BYTES - IndirectShadowPage::PAGE_OVERHEAD_BYTES; -} - -Reference IndirectShadowPager::getReadSnapshot(Version version) { - debug_printf("%s: Getting read snapshot v%lld latest v%lld oldest v%lld\n", pageFileName.c_str(), version, latestVersion, oldestVersion); - ASSERT(recovery.isReady()); - ASSERT(version <= latestVersion); - ASSERT(version >= oldestVersion); - - return Reference(new IndirectShadowPagerSnapshot(this, version)); -} - -LogicalPageID IndirectShadowPager::allocateLogicalPage() { - ASSERT(recovery.isReady()); - - LogicalPageID allocatedPage; - if(logicalFreeList.size() > 0) { - allocatedPage = logicalFreeList.front(); - logicalFreeList.pop_front(); - } - else { - ASSERT(pageTable.size() < 
std::numeric_limits::max()); // TODO: different error? - allocatedPage = pageTable.size(); - pageTable.push_back(PageVersionMap()); - } - - ASSERT(allocatedPage >= SERVER_KNOBS->PAGER_RESERVED_PAGES); - debug_printf("%s: op=allocate id=%u\n", pageFileName.c_str(), allocatedPage); - return allocatedPage; -} - -void IndirectShadowPager::freeLogicalPage(LogicalPageID pageID, Version version) { - ASSERT(recovery.isReady()); - ASSERT(committing.isReady()); - - ASSERT(pageID < pageTable.size()); - - PageVersionMap &pageVersionMap = pageTable[pageID]; - ASSERT(!pageVersionMap.empty()); - - // 0 will mean delete as of latest version, similar to write at latest version - if(version == 0) { - version = pageVersionMap.back().first; - } - - auto itr = pageVersionMapLowerBound(pageVersionMap, version); - // TODO: Is this correct, that versions from the past *forward* can be deleted? - for(auto i = itr; i != pageVersionMap.end(); ++i) { - freePhysicalPageID(i->second); - } - - if(itr != pageVersionMap.end()) { - debug_printf("%s: Clearing newest versions for logical %u: v%lld\n", pageFileName.c_str(), pageID, version); - logPageTableClearToEnd(pageID, version); - pageVersionMap.erase(itr, pageVersionMap.end()); - } - - if(pageVersionMap.size() == 0) { - debug_printf("%s: Freeing logical %u (freeLogicalPage)\n", pageFileName.c_str(), pageID); - logicalFreeList.push_back(pageID); - } - else if(pageVersionMap.back().second != PagerFile::INVALID_PAGE) { - pageVersionMap.push_back(std::make_pair(version, PagerFile::INVALID_PAGE)); - logPageTableUpdate(pageID, version, PagerFile::INVALID_PAGE); - } -} - -ACTOR Future waitAndFreePhysicalPageID(IndirectShadowPager *pager, PhysicalPageID pageID, Future canFree) { - wait(canFree); - pager->pagerFile.freePage(pageID); - return Void(); -} - -// TODO: Freeing physical pages must be done *after* committing the page map changes that cause the physical page to no longer be used. -// Otherwise, the physical page could be reused by a write followed by a power loss in which case the mapping change would not -// have been committed and so the physical page should still contain its previous data but it's been overwritten. -void IndirectShadowPager::freePhysicalPageID(PhysicalPageID pageID) { - debug_printf("%s: Freeing physical %u\n", pageFileName.c_str(), pageID); - pagerFile.freePage(pageID); -} - -void IndirectShadowPager::writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID) { - ASSERT(recovery.isReady()); - ASSERT(committing.isReady()); - - ASSERT(updateVersion > latestVersion || updateVersion == 0); - ASSERT(pageID < pageTable.size()); - - PageVersionMap &pageVersionMap = pageTable[pageID]; - - ASSERT(pageVersionMap.empty() || pageVersionMap.back().second != PagerFile::INVALID_PAGE); - - // TODO: should this be conditional on the write succeeding? 
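	// (Review note: in the removed API, updateVersion == 0 meant "replace the page's
	//  current latest version in place". The deleted logic below resolves that case,
	//  borrowing the latest version from referencePageID when this page has no
	//  version entries of its own yet.)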
- bool updateExisting = updateVersion == 0; - if(updateExisting) { - // If there is no existing latest version to update then there must be a referencePageID from which to get a latest version - // so get that version and change this to a normal update - if(pageVersionMap.empty()) { - ASSERT(referencePageID != invalidLogicalPageID); - PageVersionMap &rpv = pageTable[referencePageID]; - ASSERT(!rpv.empty()); - updateVersion = rpv.back().first; - updateExisting = false; - } - else { - ASSERT(pageVersionMap.size()); - updateVersion = pageVersionMap.back().first; - } - } - - PhysicalPageID physicalPageID = pagerFile.allocatePage(pageID, updateVersion); - - debug_printf("%s: Writing logical %d v%lld physical %d\n", pageFileName.c_str(), pageID, updateVersion, physicalPageID); - - if(updateExisting) { - // TODO: Physical page cannot be freed now, it must be done after the page mapping change above is committed - //freePhysicalPageID(pageVersionMap.back().second); - pageVersionMap.back().second = physicalPageID; - } - else { - ASSERT(pageVersionMap.empty() || pageVersionMap.back().first < updateVersion); - pageVersionMap.push_back(std::make_pair(updateVersion, physicalPageID)); - } - - logPageTableUpdate(pageID, updateVersion, physicalPageID); - - checksumWrite(dataFile.getPtr(), contents->mutate(), IndirectShadowPage::PAGE_BYTES, pageID, physicalPageID); - - Future write = holdWhile(contents, dataFile->write(contents->begin(), IndirectShadowPage::PAGE_BYTES, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES)); - - if(write.isError()) { - if(errorPromise.canBeSet()) { - errorPromise.sendError(write.getError()); - } - throw write.getError(); - } - writeActors.add(forwardError(write, errorPromise)); -} - -void IndirectShadowPager::forgetVersions(Version begin, Version end) { - ASSERT(recovery.isReady()); - ASSERT(begin <= end); - ASSERT(end <= latestVersion); - - // TODO: support forgetting arbitrary ranges - if(begin <= oldestVersion) { - oldestVersion = std::max(end, oldestVersion); - logVersion(OLDEST_VERSION_KEY, oldestVersion); - } -} - -ACTOR Future commitImpl(IndirectShadowPager *pager, Future previousCommit) { - state Future outstandingWrites = pager->writeActors.signalAndCollapse(); - state Version commitVersion = pager->latestVersion; - - wait(previousCommit); - - pager->logVersion(IndirectShadowPager::LATEST_VERSION_KEY, commitVersion); - - // TODO: we need to prevent writes that happen now from being committed in the subsequent log commit - // This is probably best done once we have better control of the log, where we can write a commit entry - // here without syncing the file. 
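	// (Review note: the deleted commit path below encodes the old pager's durability
	//  ordering: wait for outstanding page writes, sync the data file, and only then
	//  commit the page-table log, so a durable table entry can never reference a page
	//  that was not durably written.)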
- - wait(outstandingWrites); - - wait(pager->dataFile->sync()); - wait(pager->pageTableLog->commit()); - - pager->committedVersion = std::max(pager->committedVersion, commitVersion); - - return Void(); -} - -Future IndirectShadowPager::commit() { - ASSERT(recovery.isReady()); - Future f = commitImpl(this, committing); - committing = f; - return committing; -} - -void IndirectShadowPager::setLatestVersion(Version version) { - ASSERT(recovery.isReady()); - latestVersion = version; -} - -ACTOR Future getLatestVersionImpl(IndirectShadowPager *pager) { - wait(pager->recovery); - return pager->latestVersion; -} - -Future IndirectShadowPager::getLatestVersion() { - return getLatestVersionImpl(this); -} - -Future IndirectShadowPager::getError() { - return errorPromise.getFuture(); -} - -Future IndirectShadowPager::onClosed() { - return closed.getFuture(); -} - -ACTOR void shutdown(IndirectShadowPager *pager, bool dispose) { - if(pager->errorPromise.canBeSet()) - pager->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress - - // Cancel all outstanding reads - auto i = pager->busyPages.begin(); - auto iEnd = pager->busyPages.end(); - - while(i != iEnd) { - // Advance before calling cancel as the rawRead cancel will destroy the map entry it lives in - (i++)->second.read.cancel(); - } - ASSERT(pager->busyPages.empty()); - - wait(ready(pager->writeActors.signal())); - wait(ready(pager->operations.signal())); - wait(ready(pager->committing)); - - pager->housekeeping.cancel(); - pager->pagerFile.shutdown(); - - state Future pageTableClosed = pager->pageTableLog->onClosed(); - if(dispose) { - wait(ready(IAsyncFileSystem::filesystem()->deleteFile(pager->pageFileName, true))); - pager->pageTableLog->dispose(); - } - else { - pager->pageTableLog->close(); - } - - wait(ready(pageTableClosed)); - - pager->closed.send(Void()); - delete pager; -} - -void IndirectShadowPager::dispose() { - shutdown(this, true); -} - -void IndirectShadowPager::close() { - shutdown(this, false); -} - -ACTOR Future> rawRead(IndirectShadowPager *pager, LogicalPageID logicalPageID, PhysicalPageID physicalPageID) { - state void *data; - state int len = IndirectShadowPage::PAGE_BYTES; - state bool readSuccess = false; - - try { - wait(pager->dataFile->readZeroCopy(&data, &len, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES)); - readSuccess = true; - - if(!checksumRead(pager->dataFile.getPtr(), (uint8_t *)data, len, logicalPageID, physicalPageID)) { - throw checksum_failed(); - } - - pager->busyPages.erase(physicalPageID); - return Reference(new IndirectShadowPage((uint8_t *)data, pager->dataFile, physicalPageID)); - } - catch(Error &e) { - pager->busyPages.erase(physicalPageID); - if(readSuccess || e.code() == error_code_actor_cancelled) { - pager->dataFile->releaseZeroCopy(data, len, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES); - } - throw; - } -} - -Future> getPageImpl(IndirectShadowPager *pager, Reference snapshot, LogicalPageID logicalPageID, Version version) { - ASSERT(logicalPageID < pager->pageTable.size()); - PageVersionMap &pageVersionMap = pager->pageTable[logicalPageID]; - - auto itr = IndirectShadowPager::pageVersionMapUpperBound(pageVersionMap, version); - if(itr == pageVersionMap.begin()) { - debug_printf("%s: Page version map empty! 
op=error id=%u @%lld\n", pager->pageFileName.c_str(), logicalPageID, version); - ASSERT(false); - } - --itr; - PhysicalPageID physicalPageID = itr->second; - ASSERT(physicalPageID != PagerFile::INVALID_PAGE); - - debug_printf("%s: Reading logical %d v%lld physical %d mapSize %lu\n", pager->pageFileName.c_str(), logicalPageID, version, physicalPageID, pageVersionMap.size()); - - IndirectShadowPager::BusyPage &bp = pager->busyPages[physicalPageID]; - if(!bp.read.isValid()) { - Future> get = rawRead(pager, logicalPageID, physicalPageID); - if(!get.isReady()) { - bp.read = get; - } - return get; - } - return bp.read; -} - -Future> IndirectShadowPager::getPage(Reference snapshot, LogicalPageID pageID, Version version) { - if(!recovery.isReady()) { - debug_printf("%s: getPage failure, recovery not ready - op=error id=%u @%lld\n", pageFileName.c_str(), pageID, version); - ASSERT(false); - } - - Future> f = getPageImpl(this, snapshot, pageID, version); - operations.add(forwardError(ready(f), errorPromise)); // For some reason if success is ready() then shutdown hangs when waiting on operations - return f; -} - -PageVersionMap::iterator IndirectShadowPager::pageVersionMapLowerBound(PageVersionMap &pageVersionMap, Version version) { - return std::lower_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](std::pair p, Version v) { - return p.first < v; - }); -} - -PageVersionMap::iterator IndirectShadowPager::pageVersionMapUpperBound(PageVersionMap &pageVersionMap, Version version) { - return std::upper_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](Version v, std::pair p) { - return v < p.first; - }); -} - -void IndirectShadowPager::freeLogicalPageID(LogicalPageID pageID) { - if(pageID >= SERVER_KNOBS->PAGER_RESERVED_PAGES) { - debug_printf("%s: Freeing logical %u\n", pageFileName.c_str(), pageID); - logicalFreeList.push_back(pageID); - } -} - -void IndirectShadowPager::logVersion(StringRef versionKey, Version version) { - BinaryWriter v(Unversioned()); - v << version; - - pageTableLog->set(KeyValueRef(versionKey, v.toValue())); -} - -void IndirectShadowPager::logPagesAllocated() { - BinaryWriter v(Unversioned()); - v << pagerFile.getPagesAllocated(); - - pageTableLog->set(KeyValueRef(PAGES_ALLOCATED_KEY, v.toValue())); -} - -void IndirectShadowPager::logPageTableUpdate(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID) { - BinaryWriter k(Unversioned()); - k << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(version); - - BinaryWriter v(Unversioned()); - v << physicalPageID; - - pageTableLog->set(KeyValueRef(k.toValue(), v.toValue())); -} - -void IndirectShadowPager::logPageTableClearToEnd(LogicalPageID logicalPageID, Version start) { - BinaryWriter b(Unversioned()); - b << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(start); - - BinaryWriter e(Unversioned()); - e << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID); - - pageTableLog->clear(KeyRangeRef(b.toValue(), strinc(e.toValue()))); -} - -void IndirectShadowPager::logPageTableClear(LogicalPageID logicalPageID, Version start, Version end) { - BinaryWriter b(Unversioned()); - b << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(start); - - BinaryWriter e(Unversioned()); - e << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(end); - - pageTableLog->clear(KeyRangeRef(b.toValue(), e.toValue())); -} - -const StringRef IndirectShadowPager::LATEST_VERSION_KEY = 
LiteralStringRef("\xff/LatestVersion"); -const StringRef IndirectShadowPager::OLDEST_VERSION_KEY = LiteralStringRef("\xff/OldestVersion"); -const StringRef IndirectShadowPager::PAGES_ALLOCATED_KEY = LiteralStringRef("\xff/PagesAllocated"); -const StringRef IndirectShadowPager::TABLE_ENTRY_PREFIX = LiteralStringRef("\x00"); - -ACTOR Future copyPage(IndirectShadowPager *pager, Reference page, LogicalPageID logical, PhysicalPageID from, PhysicalPageID to) { - state bool zeroCopied = true; - state int bytes = IndirectShadowPage::PAGE_BYTES; - state void *data = nullptr; - - try { - try { - wait(pager->dataFile->readZeroCopy(&data, &bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES)); - } - catch(Error &e) { - zeroCopied = false; - data = page->mutate(); - int _bytes = wait(pager->dataFile->read(data, page->size(), (int64_t)from * IndirectShadowPage::PAGE_BYTES)); - bytes = _bytes; - } - - ASSERT(bytes == IndirectShadowPage::PAGE_BYTES); - checksumWrite(pager->dataFile.getPtr(), page->mutate(), bytes, logical, to); - wait(pager->dataFile->write(data, bytes, (int64_t)to * IndirectShadowPage::PAGE_BYTES)); - if(zeroCopied) { - pager->dataFile->releaseZeroCopy(data, bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES); - } - } - catch(Error &e) { - if(zeroCopied) { - pager->dataFile->releaseZeroCopy(data, bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES); - } - pager->pagerFile.freePage(to); - throw e; - } - - return Void(); -} - -ACTOR Future vacuumer(IndirectShadowPager *pager, PagerFile *pagerFile) { - state Reference page(new IndirectShadowPage()); - - loop { - state double start = now(); - while(!pagerFile->canVacuum()) { - wait(delay(1.0)); - } - - ASSERT(!pagerFile->freePages.empty()); - - if(!pagerFile->vacuumQueue.empty()) { - state PhysicalPageID lastUsedPage = pagerFile->vacuumQueue.rbegin()->first; - PhysicalPageID lastFreePage = *pagerFile->freePages.rbegin(); - debug_printf("%s: Vacuuming: evaluating (free list size=%lu, lastFreePage=%u, lastUsedPage=%u, pagesAllocated=%u)\n", pager->pageFileName.c_str(), pagerFile->freePages.size(), lastFreePage, lastUsedPage, pagerFile->pagesAllocated); - ASSERT(lastFreePage < pagerFile->pagesAllocated); - ASSERT(lastUsedPage < pagerFile->pagesAllocated); - ASSERT(lastFreePage != lastUsedPage); - - if(lastFreePage < lastUsedPage) { - state std::pair logicalPageInfo = pagerFile->vacuumQueue[lastUsedPage]; - state PhysicalPageID newPage = pagerFile->allocatePage(logicalPageInfo.first, logicalPageInfo.second); - - debug_printf("%s: Vacuuming: copying page %u to %u\n", pager->pageFileName.c_str(), lastUsedPage, newPage); - wait(copyPage(pager, page, logicalPageInfo.first, lastUsedPage, newPage)); - - auto &pageVersionMap = pager->pageTable[logicalPageInfo.first]; - auto itr = IndirectShadowPager::pageVersionMapLowerBound(pageVersionMap, logicalPageInfo.second); - if(itr != pageVersionMap.end() && itr->second == lastUsedPage) { - itr->second = newPage; - pager->logPageTableUpdate(logicalPageInfo.first, itr->first, newPage); - pagerFile->freePage(lastUsedPage); - } - else { - TEST(true); // page was freed while vacuuming - pagerFile->freePage(newPage); - } - } - } - - PhysicalPageID firstFreePage = pagerFile->vacuumQueue.empty() ? 
pagerFile->minVacuumQueuePage : (pagerFile->vacuumQueue.rbegin()->first + 1); - ASSERT(pagerFile->pagesAllocated >= firstFreePage); - - uint64_t pagesToErase = 0; - if(pagerFile->freePages.size() >= SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD) { - pagesToErase = std::min(pagerFile->freePages.size() - SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD + 1, pagerFile->pagesAllocated - firstFreePage); - } - - debug_printf("%s: Vacuuming: got %llu pages to erase (freePages=%lu, pagesAllocated=%u, vacuumQueueEmpty=%u, minVacuumQueuePage=%u, firstFreePage=%u)\n", pager->pageFileName.c_str(), pagesToErase, pagerFile->freePages.size(), pagerFile->pagesAllocated, pagerFile->vacuumQueue.empty(), pagerFile->minVacuumQueuePage, firstFreePage); - - if(pagesToErase > 0) { - PhysicalPageID eraseStartPage = pagerFile->pagesAllocated - pagesToErase; - debug_printf("%s: Vacuuming: truncating last %llu pages starting at %u\n", pager->pageFileName.c_str(), pagesToErase, eraseStartPage); - - ASSERT(pagesToErase <= pagerFile->pagesAllocated); - - pagerFile->pagesAllocated = eraseStartPage; - pager->logPagesAllocated(); - - auto freePageItr = pagerFile->freePages.find(eraseStartPage); - ASSERT(freePageItr != pagerFile->freePages.end()); - - pagerFile->freePages.erase(freePageItr, pagerFile->freePages.end()); - ASSERT(pagerFile->vacuumQueue.empty() || pagerFile->vacuumQueue.rbegin()->first < eraseStartPage); - - wait(pager->dataFile->truncate((int64_t)pagerFile->pagesAllocated * IndirectShadowPage::PAGE_BYTES)); - } - - wait(delayUntil(start + (double)IndirectShadowPage::PAGE_BYTES / SERVER_KNOBS->VACUUM_BYTES_PER_SECOND)); // TODO: figure out the correct mechanism here - } -} - -PagerFile::PagerFile(IndirectShadowPager *pager) : fileSize(0), pagesAllocated(0), pager(pager), vacuumQueueReady(false), minVacuumQueuePage(0) {} - -PhysicalPageID PagerFile::allocatePage(LogicalPageID logicalPageID, Version version) { - ASSERT((int64_t)pagesAllocated * IndirectShadowPage::PAGE_BYTES <= fileSize); - ASSERT(fileSize % IndirectShadowPage::PAGE_BYTES == 0); - - PhysicalPageID allocatedPage; - if(!freePages.empty()) { - allocatedPage = *freePages.begin(); - freePages.erase(freePages.begin()); - } - else { - if((int64_t)pagesAllocated * IndirectShadowPage::PAGE_BYTES == fileSize) { - fileSize += (1 << 24); - // TODO: extend the file before writing beyond the end. - } - - ASSERT(pagesAllocated < INVALID_PAGE); // TODO: we should throw a better error here - allocatedPage = pagesAllocated++; - pager->logPagesAllocated(); - } - - markPageAllocated(logicalPageID, version, allocatedPage); - - debug_printf("%s: Allocated physical %u\n", pager->pageFileName.c_str(), allocatedPage); - return allocatedPage; -} - -void PagerFile::freePage(PhysicalPageID pageID) { - freePages.insert(pageID); - - if(pageID >= minVacuumQueuePage) { - vacuumQueue.erase(pageID); - } -} - -void PagerFile::markPageAllocated(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID) { - if(physicalPageID != INVALID_PAGE && physicalPageID >= minVacuumQueuePage) { - vacuumQueue[physicalPageID] = std::make_pair(logicalPageID, version); - } -} - -void PagerFile::finishedMarkingPages() { - if(minVacuumQueuePage >= pagesAllocated) { - minVacuumQueuePage = pagesAllocated >= SERVER_KNOBS->VACUUM_QUEUE_SIZE ? 
pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE : 0; - vacuumQueueReady = false; - } - else { - if(!vacuumQueueReady) { - vacuumQueueReady = true; - } - if(pagesAllocated > SERVER_KNOBS->VACUUM_QUEUE_SIZE && minVacuumQueuePage < pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE) { - minVacuumQueuePage = pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE; - auto itr = vacuumQueue.lower_bound(minVacuumQueuePage); - vacuumQueue.erase(vacuumQueue.begin(), itr); - } - } -} - -uint64_t PagerFile::size() { - return fileSize; -} - -uint32_t PagerFile::getPagesAllocated() { - return pagesAllocated; -} - -uint32_t PagerFile::getFreePages() { - return freePages.size(); -} - -void PagerFile::init(uint64_t fileSize, uint32_t pagesAllocated) { - this->fileSize = fileSize; - this->pagesAllocated = pagesAllocated; - this->minVacuumQueuePage = pagesAllocated >= SERVER_KNOBS->VACUUM_QUEUE_SIZE ? pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE : 0; -} - -void PagerFile::startVacuuming() { - vacuuming = Never(); //vacuumer(pager, this); -} - -void PagerFile::shutdown() { - vacuuming.cancel(); -} - -bool PagerFile::canVacuum() { - if(freePages.size() < SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD // Not enough free pages - || minVacuumQueuePage >= pagesAllocated // We finished processing all pages in the vacuum queue - || !vacuumQueueReady) // Populating vacuum queue - { - debug_printf("%s: Vacuuming: waiting for vacuumable pages (free list size=%lu, minVacuumQueuePage=%u, pages allocated=%u, vacuumQueueReady=%d)\n", pager->pageFileName.c_str(), freePages.size(), minVacuumQueuePage, pagesAllocated, vacuumQueueReady); - return false; - } - - return true; -} - -const PhysicalPageID PagerFile::INVALID_PAGE = std::numeric_limits::max(); - -extern Future simplePagerTest(IPager* const& pager); - -TEST_CASE("/fdbserver/indirectshadowpager/simple") { - state IPager *pager = new IndirectShadowPager("unittest_pageFile"); - - wait(simplePagerTest(pager)); - - Future closedFuture = pager->onClosed(); - pager->close(); - wait(closedFuture); - - return Void(); -} diff --git a/fdbserver/IndirectShadowPager.h b/fdbserver/IndirectShadowPager.h deleted file mode 100644 index a711c7ba63..0000000000 --- a/fdbserver/IndirectShadowPager.h +++ /dev/null @@ -1,215 +0,0 @@ -/* - * IndirectShadowPager.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef FDBSERVER_INDIRECTSHADOWPAGER_H -#define FDBSERVER_INDIRECTSHADOWPAGER_H -#pragma once - -#include "fdbserver/IKeyValueStore.h" -#include "fdbserver/IPager.h" - -#include "flow/ActorCollection.h" -#include "fdbclient/Notified.h" - -#include "fdbrpc/IAsyncFile.h" - -typedef uint32_t PhysicalPageID; -typedef std::vector> PageVersionMap; -typedef std::vector LogicalPageTable; - -class IndirectShadowPager; - -class IndirectShadowPage : public IPage, ReferenceCounted { -public: - IndirectShadowPage(); - IndirectShadowPage(uint8_t *data, Reference file, PhysicalPageID pageID) - : file(file), physicalPageID(pageID), fastAllocated(false), data(data) {} - virtual ~IndirectShadowPage(); - - virtual void addref() const { - ReferenceCounted::addref(); - } - - virtual void delref() const { - ReferenceCounted::delref(); - } - - virtual int size() const; - virtual uint8_t const* begin() const; - virtual uint8_t* mutate(); - -//private: - static const int PAGE_BYTES; - static const int PAGE_OVERHEAD_BYTES; - -private: - Reference file; - PhysicalPageID physicalPageID; - bool fastAllocated; - uint8_t *data; -}; - -class IndirectShadowPagerSnapshot : public IPagerSnapshot, ReferenceCounted { -public: - IndirectShadowPagerSnapshot(IndirectShadowPager *pager, Version version); - - virtual Future> getPhysicalPage(LogicalPageID pageID); - - virtual Version getVersion() const { - return version; - } - - virtual ~IndirectShadowPagerSnapshot() { - } - - virtual void addref() { - ReferenceCounted::addref(); - } - - virtual void delref() { - ReferenceCounted::delref(); - } - -private: - IndirectShadowPager *pager; - Version version; - Future pagerError; -}; - -class PagerFile { -public: - PagerFile(IndirectShadowPager *pager); - - PhysicalPageID allocatePage(LogicalPageID logicalPageID, Version version); - void freePage(PhysicalPageID physicalPageID); - void markPageAllocated(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID); - - void finishedMarkingPages(); - - uint64_t size(); - uint32_t getPagesAllocated(); - uint32_t getFreePages(); - - void init(uint64_t fileSize, uint32_t pagesAllocated); - void startVacuuming(); - void shutdown(); - -//private: - Future vacuuming; - IndirectShadowPager *pager; - - uint32_t pagesAllocated; - uint64_t fileSize; - - std::set freePages; - - PhysicalPageID minVacuumQueuePage; - bool vacuumQueueReady; - std::map> vacuumQueue; - - bool canVacuum(); - - static const PhysicalPageID INVALID_PAGE; -}; - -class IndirectShadowPager : public IPager { -public: - IndirectShadowPager(std::string basename); - virtual ~IndirectShadowPager() { - } - - virtual Reference newPageBuffer(); - virtual int getUsablePageSize(); - - virtual Reference getReadSnapshot(Version version); - - virtual LogicalPageID allocateLogicalPage(); - virtual void freeLogicalPage(LogicalPageID pageID, Version version); - virtual void writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID); - virtual void forgetVersions(Version begin, Version end); - virtual Future commit(); - - virtual void setLatestVersion(Version version); - virtual Future getLatestVersion(); - - virtual StorageBytes getStorageBytes(); - - virtual Future getError(); - virtual Future onClosed(); - virtual void dispose(); - virtual void close(); - - Future> getPage(Reference snapshot, LogicalPageID pageID, Version version); - -//private: - std::string basename; - std::string pageFileName; - - Version latestVersion; - Version committedVersion; - - LogicalPageTable 
pageTable; - IKeyValueStore *pageTableLog; - - Reference dataFile; - Future recovery; - - Future housekeeping; - Future vacuuming; - Version oldestVersion; - - // TODO: This structure maybe isn't needed - struct BusyPage { - Future> read; - }; - - typedef std::map BusyPageMapT; - BusyPageMapT busyPages; - - SignalableActorCollection operations; - SignalableActorCollection writeActors; - Future committing; - - Promise closed; - Promise errorPromise; - - std::deque logicalFreeList; - PagerFile pagerFile; - - static PageVersionMap::iterator pageVersionMapLowerBound(PageVersionMap &pageVersionMap, Version v); - static PageVersionMap::iterator pageVersionMapUpperBound(PageVersionMap &pageVersionMap, Version v); - - void freeLogicalPageID(LogicalPageID pageID); - void freePhysicalPageID(PhysicalPageID pageID); - - void logVersion(StringRef versionKey, Version version); - void logPagesAllocated(); - void logPageTableUpdate(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID); - void logPageTableClearToEnd(LogicalPageID logicalPageID, Version start); - void logPageTableClear(LogicalPageID logicalPageID, Version start, Version end); - - static const StringRef LATEST_VERSION_KEY; - static const StringRef OLDEST_VERSION_KEY; - static const StringRef PAGES_ALLOCATED_KEY; - static const StringRef TABLE_ENTRY_PREFIX; - -}; - -#endif diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index a4184788fe..9ca58cb830 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -67,7 +67,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2; init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 ); init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000; - init( VERSIONS_PER_BATCH, VERSIONS_PER_SECOND/20 ); if( randomize && BUGGIFY ) VERSIONS_PER_BATCH = std::max(1,VERSIONS_PER_SECOND/1000); + init( DESIRED_OUTSTANDING_MESSAGES, 5000 ); if( randomize && BUGGIFY ) DESIRED_OUTSTANDING_MESSAGES = deterministicRandom()->randomInt(0,100); + init( DESIRED_GET_MORE_DELAY, 0.005 ); init( CONCURRENT_LOG_ROUTER_READS, 1 ); init( LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED, 1 ); if( randomize && BUGGIFY ) LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED = 0; init( DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME, 1.0 ); @@ -80,6 +81,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( DISK_QUEUE_MAX_TRUNCATE_BYTES, 2<<30 ); if ( randomize && BUGGIFY ) DISK_QUEUE_MAX_TRUNCATE_BYTES = 0; init( TLOG_DEGRADED_DELAY_COUNT, 5 ); init( TLOG_DEGRADED_DURATION, 5.0 ); + init( MAX_CACHE_VERSIONS, 10e6 ); init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 ); init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01(); @@ -130,8 +132,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( MAX_SHARD_BYTES, 500000000 ); init( KEY_SERVER_SHARD_BYTES, 500000000 ); bool buggifySmallReadBandwidth = randomize && BUGGIFY; - init( SHARD_MAX_BYTES_READ_PER_KSEC, 100LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000; - /* 100*1MB/sec * 1000sec/ksec + init( SHARD_MAX_BYTES_READ_PER_KSEC, 8LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000; + /* 8*1MB/sec * 1000sec/ksec Shards with more than this read bandwidth will be considered as a read cache candidate */ init( SHARD_MAX_BYTES_READ_PER_KSEC_JITTER, 
0.1 ); @@ -327,6 +329,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( ENFORCED_MIN_RECOVERY_DURATION, 0.085 ); if( shortRecoveryDuration ) ENFORCED_MIN_RECOVERY_DURATION = 0.01; init( REQUIRED_MIN_RECOVERY_DURATION, 0.080 ); if( shortRecoveryDuration ) REQUIRED_MIN_RECOVERY_DURATION = 0.01; init( ALWAYS_CAUSAL_READ_RISKY, false ); + init( MAX_COMMIT_UPDATES, 100000 ); if( randomize && BUGGIFY ) MAX_COMMIT_UPDATES = 1; // Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistribution) @@ -456,7 +459,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( SPLIT_JITTER_AMOUNT, 0.05 ); if( randomize && BUGGIFY ) SPLIT_JITTER_AMOUNT = 0.2; init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 ); init( BANDWIDTH_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); - init( BYTES_READ_UNITS_PER_SAMPLE, 100); // Effectively weight up read on small or non-existing key/values. + init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes + init( EMPTY_READ_PENALTY, 20 ); // 20 bytes //Storage Server init( STORAGE_LOGGING_DELAY, 5.0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index f4309f9b13..3d12be885a 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -70,7 +70,8 @@ public: int PARALLEL_GET_MORE_REQUESTS; int MULTI_CURSOR_PRE_FETCH_LIMIT; int64_t MAX_QUEUE_COMMIT_BYTES; - int64_t VERSIONS_PER_BATCH; + int DESIRED_OUTSTANDING_MESSAGES; + double DESIRED_GET_MORE_DELAY; int CONCURRENT_LOG_ROUTER_READS; int LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED; // 0==peek from primary, non-zero==peek from satellites double DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME; @@ -83,6 +84,7 @@ public: int DISK_QUEUE_MAX_TRUNCATE_BYTES; // A truncate larger than this will cause the file to be replaced instead.
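/* [Editor's sketch] To make the BYTES_READ_UNITS_PER_SAMPLE / EMPTY_READ_PENALTY change above
 * concrete: the idea is that every read is charged at least EMPTY_READ_PENALTY bytes, so reads of
 * small or non-existing keys still register in read-bandwidth sampling, and samples are then taken
 * per BYTES_READ_UNITS_PER_SAMPLE charged bytes. This is an illustrative model only, not the actual
 * StorageMetrics code; the function names and the exact sampling rule below are assumptions. */
#include <algorithm>
#include <cstdint>

static const int64_t BYTES_READ_UNITS_PER_SAMPLE = 100000; // 100K bytes, as initialized above
static const int64_t EMPTY_READ_PENALTY = 20;              // 20 bytes, as initialized above

// Bytes charged against the read sample for a read returning `bytesReturned`
// bytes (0 for a missing key): never less than the empty-read penalty.
int64_t chargedReadBytes(int64_t bytesReturned) {
	return std::max(bytesReturned, EMPTY_READ_PENALTY);
}

// Hypothetical sampling decision: on average one sample per
// BYTES_READ_UNITS_PER_SAMPLE charged bytes, so hot ranges surface in the
// sample without recording every individual read.
bool shouldSampleRead(int64_t chargedBytes, double rand01) {
	return rand01 * BYTES_READ_UNITS_PER_SAMPLE < chargedBytes;
}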
int TLOG_DEGRADED_DELAY_COUNT; double TLOG_DEGRADED_DURATION; + int64_t MAX_CACHE_VERSIONS; double TXS_POPPED_MAX_DELAY; // Data distribution queue @@ -269,6 +271,7 @@ public: double ENFORCED_MIN_RECOVERY_DURATION; double REQUIRED_MIN_RECOVERY_DURATION; bool ALWAYS_CAUSAL_READ_RISKY; + int MAX_COMMIT_UPDATES; // Master Server double COMMIT_SLEEP_TIME; @@ -394,6 +397,7 @@ public: int64_t IOPS_UNITS_PER_SAMPLE; int64_t BANDWIDTH_UNITS_PER_SAMPLE; int64_t BYTES_READ_UNITS_PER_SAMPLE; + int64_t EMPTY_READ_PENALTY; //Storage Server double STORAGE_LOGGING_DELAY; diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 7579f7e908..a0f27e3f86 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -243,6 +243,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { state Version ver = 0; state std::vector messages; + state Arena arena; while (true) { state bool foundMessage = r->hasMessage(); if (!foundMessage || r->version().version != ver) { @@ -258,6 +259,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { lastVer = ver; ver = r->version().version; messages.clear(); + arena = Arena(); if (!foundMessage) { ver--; //ver is the next possible version we will get data for @@ -275,8 +277,9 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { tagAndMsg.message = r->getMessageWithTags(); tags.clear(); self->logSet.getPushLocations(r->getTags(), tags, 0); + tagAndMsg.tags.reserve(arena, tags.size()); for (const auto& t : tags) { - tagAndMsg.tags.emplace_back(tagLocalityRemoteLog, t); + tagAndMsg.tags.push_back(arena, Tag(tagLocalityRemoteLog, t)); } messages.push_back(std::move(tagAndMsg)); @@ -337,6 +340,9 @@ ACTOR Future logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r try { peekId = req.sequence.get().first; sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw timed_out(); + } auto& trackerData = self->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index e1c5e36807..8c672e0111 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -232,7 +232,7 @@ public: return resultEntries.size() == 0; } - void getPushLocations(std::vector const& tags, std::vector& locations, int locationOffset, + void getPushLocations(VectorRef tags, std::vector& locations, int locationOffset, bool allLocations = false) { if(locality == tagLocalitySatellite) { for(auto& t : tags) { @@ -310,7 +310,7 @@ struct ILogSystem { //pre: only callable if hasMessage() returns true //return the tags associated with the message for the current sequence - virtual const std::vector& getTags() = 0; + virtual VectorRef getTags() = 0; //pre: only callable if hasMessage() returns true //returns the arena containing the contents of getMessage(), getMessageWithTags(), and reader() @@ -405,7 +405,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -438,6 +438,7 @@ struct ILogSystem { bool hasNextMessage; UID randomID; int tLogReplicationFactor; + Future more; 
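/* [Editor's sketch] The guard added to logRouterPeekMessages above (and to the TLog peek paths
 * later in this diff) in plain terms: a peek session is keyed by peekId and issues requests with
 * increasing sequence numbers. If a request arrives whose sequence is at or beyond
 * PARALLEL_GET_MORE_REQUESTS but no tracker entry exists, the session state must have been cleaned
 * up in the meantime; recreating it mid-stream could silently skip data, so the request fails and
 * the client restarts the session. A hypothetical standalone restatement (std exceptions stand in
 * for flow's timed_out()): */
#include <cstdint>
#include <map>
#include <stdexcept>

struct PeekSession { int64_t nextSequence = 0; };
static const int64_t PARALLEL_GET_MORE_REQUESTS = 32; // mirrors the knob above

void handlePeek(std::map<uint64_t, PeekSession>& tracker, uint64_t peekId, int64_t sequence) {
	if (sequence >= PARALLEL_GET_MORE_REQUESTS && tracker.find(peekId) == tracker.end()) {
		// A mid-stream request for an expired session: do not resurrect it with a gap.
		throw std::runtime_error("timed_out");
	}
	PeekSession& session = tracker[peekId]; // creates an entry only for fresh sessions
	session.nextSequence = sequence + 1;
}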
MergedPeekCursor( std::vector< Reference > const& serverCursors, Version begin ); MergedPeekCursor( std::vector>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end, bool parallelGetMore, std::vector const& tLogLocalities, Reference const tLogPolicy, int tLogReplicationFactor ); @@ -453,7 +454,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -484,6 +485,7 @@ struct ILogSystem { bool hasNextMessage; bool useBestSet; UID randomID; + Future more; SetPeekCursor( std::vector> const& logSets, int bestSet, int bestServer, Tag tag, Version begin, Version end, bool parallelGetMore ); SetPeekCursor( std::vector> const& logSets, std::vector< std::vector< Reference > > const& serverCursors, LogMessageVersion const& messageVersion, int bestSet, int bestServer, Optional nextVersion, bool useBestSet ); @@ -498,7 +500,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -532,7 +534,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -555,12 +557,12 @@ struct ILogSystem { struct BufferedMessage { Arena arena; StringRef message; - std::vector tags; + VectorRef tags; LogMessageVersion version; BufferedMessage() {} explicit BufferedMessage( Version version ) : version(version) {} - BufferedMessage( Arena arena, StringRef message, const std::vector& tags, const LogMessageVersion& version ) : arena(arena), message(message), tags(tags), version(version) {} + BufferedMessage( Arena arena, StringRef message, const VectorRef& tags, const LogMessageVersion& version ) : arena(arena), message(message), tags(tags), version(version) {} bool operator < (BufferedMessage const& r) const { return version < r.version; @@ -572,23 +574,28 @@ struct ILogSystem { }; std::vector> cursors; + std::vector> cursorMessages; std::vector messages; int messageIndex; LogMessageVersion messageVersion; Version end; bool hasNextMessage; bool withTags; + bool knownUnique; + Version minKnownCommittedVersion; Version poppedVersion; Version initialPoppedVersion; bool canDiscardPopped; Future more; + int targetQueueSize; + UID randomID; //FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade. 
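/* [Editor's sketch] The BufferedCursor members added above (cursorMessages, targetQueueSize,
 * minKnownCommittedVersion) support the batching scheme implemented later in this diff in
 * LogSystemPeekCursor.actor.cpp: each underlying cursor fills its own queue up to roughly
 * DESIRED_OUTSTANDING_MESSAGES / cursors.size() messages, and the merged stream only releases
 * messages below the minimum version all cursors have reached, since anything newer could still
 * interleave with data not yet fetched. A minimal single-threaded model (versions as plain
 * integers, payloads elided; assumes at least one cursor): */
#include <algorithm>
#include <cstdint>
#include <deque>
#include <vector>

struct Msg { int64_t version; };

// Drain every per-cursor queue up to (not including) the minimum version any
// cursor has reached, producing one merged, version-ordered batch.
std::vector<Msg> drainBelowMinVersion(std::vector<std::deque<Msg>>& queues,
                                      const std::vector<int64_t>& cursorVersions) {
	int64_t minVersion = *std::min_element(cursorVersions.begin(), cursorVersions.end());
	std::vector<Msg> out;
	for (auto& q : queues) {
		while (!q.empty() && q.front().version < minVersion) {
			out.push_back(q.front());
			q.pop_front();
		}
	}
	std::sort(out.begin(), out.end(),
	          [](const Msg& a, const Msg& b) { return a.version < b.version; });
	return out;
}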
bool collectTags; - std::vector tags; void combineMessages(); BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ); + BufferedCursor( std::vector>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore ); virtual Reference cloneNoMore(); virtual void setProtocolVersion( ProtocolVersion version ); @@ -598,7 +605,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -644,7 +651,7 @@ struct ILogSystem { // Returns when the preceding changes are durable. (Later we will need multiple return signals for different durability levels) // If the current epoch has ended, push will not return, and the pushed messages will not be visible in any subsequent epoch (but may become visible in this epoch) - virtual Reference peek( UID dbgid, Version begin, Tag tag, bool parallelGetMore = false ) = 0; + virtual Reference peek( UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore = false ) = 0; // Returns (via cursor interface) a stream of messages with the given tag and message versions >= (begin, 0), ordered by message version // If pop was previously or concurrently called with upTo > begin, the cursor may not return all such messages. In that case cursor->popped() will // be greater than begin to reflect that. @@ -710,7 +717,11 @@ struct ILogSystem { virtual Future onLogSystemConfigChange() = 0; // Returns when the log system configuration has changed due to a tlog rejoin.
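/* [Editor's sketch] The signature changes in this header (std::vector of Tag replaced by
 * VectorRef) swap an owning container for a non-owning (pointer, length) view whose backing memory
 * is owned by an Arena, so tags can be carried between messages without a heap allocation per
 * message. The compatibility wrapper just below builds such a view over an existing vector's
 * contiguous storage. A stand-in type showing the shape of that conversion (VectorRef itself is
 * flow's type and is not reproduced here): */
#include <cstddef>
#include <cstdint>
#include <vector>

struct Tag { int8_t locality; uint16_t id; };

template <class T>
struct VectorRefLike { // non-owning view; the caller guarantees the data outlives it
	const T* data = nullptr;
	size_t length = 0;
};

template <class T>
VectorRefLike<T> viewOf(const std::vector<T>& v) {
	// Note: v.data() is valid even for an empty vector, unlike &v.front(),
	// which the wrapper below uses and which assumes a non-empty vector.
	return VectorRefLike<T>{ v.data(), v.size() };
}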
- virtual void getPushLocations(std::vector const& tags, std::vector& locations, bool allLocations = false) = 0; + virtual void getPushLocations(VectorRef tags, std::vector& locations, bool allLocations = false) = 0; + + void getPushLocations(std::vector const& tags, std::vector& locations, bool allLocations = false) { + getPushLocations(VectorRef((Tag*)&tags.front(), tags.size()), locations, allLocations); + } virtual bool hasRemoteLogs() const = 0; diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 2c03c19027..2765900fe6 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -110,7 +110,7 @@ StringRef ILogSystem::ServerPeekCursor::getMessageWithTags() { return rawMessage; } -const std::vector& ILogSystem::ServerPeekCursor::getTags() { +VectorRef ILogSystem::ServerPeekCursor::getTags() { return messageAndTags.tags; } @@ -150,6 +150,12 @@ ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self while(self->futureResults.size() < SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->interf->get().present()) { self->futureResults.push_back( brokenPromiseToNever( self->interf->get().interf().peekMessages.getReply(TLogPeekRequest(self->messageVersion.version,self->tag,self->returnIfBlocked, self->onlySpilled, std::make_pair(self->randomID, self->sequence++)), taskID) ) ); } + if (self->sequence == std::numeric_limitssequence)>::max()) { + throw timed_out(); + } + } else if (self->futureResults.size() == 1) { + self->randomID = deterministicRandom()->randomUniqueID(); + self->sequence = 0; } else if (self->futureResults.size() == 0) { return Void(); } @@ -430,7 +436,7 @@ StringRef ILogSystem::MergedPeekCursor::getMessageWithTags() { return serverCursors[currentCursor]->getMessageWithTags(); } -const std::vector& ILogSystem::MergedPeekCursor::getTags() { +VectorRef ILogSystem::MergedPeekCursor::getTags() { return serverCursors[currentCursor]->getTags(); } @@ -469,6 +475,10 @@ ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMess } Future ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) { + if( more.isValid() && !more.isReady() ) { + return more; + } + if(!serverCursors.size()) return Never(); @@ -482,7 +492,8 @@ Future ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) { if (version() > startVersion) return Void(); - return mergedPeekGetMore(this, startVersion, taskID); + more = mergedPeekGetMore(this, startVersion, taskID); + return more; } Future ILogSystem::MergedPeekCursor::onFailed() { @@ -689,7 +700,7 @@ StringRef ILogSystem::SetPeekCursor::getMessage() { return serverCursors[current StringRef ILogSystem::SetPeekCursor::getMessageWithTags() { return serverCursors[currentSet][currentCursor]->getMessageWithTags(); } -const std::vector& ILogSystem::SetPeekCursor::getTags() { +VectorRef ILogSystem::SetPeekCursor::getTags() { return serverCursors[currentSet][currentCursor]->getTags(); } @@ -770,6 +781,10 @@ ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer } Future ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) { + if( more.isValid() && !more.isReady() ) { + return more; + } + auto startVersion = version(); calcHasMessage(); if( hasMessage() ) @@ -780,7 +795,8 @@ Future ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) { if (version() > startVersion) return Void(); - return setPeekGetMore(this, startVersion, taskID); + more = setPeekGetMore(this, startVersion, taskID); + return more; } Future 
ILogSystem::SetPeekCursor::onFailed() { @@ -851,7 +867,7 @@ StringRef ILogSystem::MultiCursor::getMessageWithTags() { return cursors.back()->getMessageWithTags(); } -const std::vector& ILogSystem::MultiCursor::getTags() { +VectorRef ILogSystem::MultiCursor::getTags() { return cursors.back()->getTags(); } @@ -901,8 +917,20 @@ Version ILogSystem::MultiCursor::popped() { return std::max(poppedVersion, cursors.back()->popped()); } -ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped) { - messages.reserve(10000); +ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped), knownUnique(false), minKnownCommittedVersion(0), randomID(deterministicRandom()->randomUniqueID()) { + targetQueueSize = SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES/cursors.size(); + messages.reserve(SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES); + cursorMessages.resize(cursors.size()); +} + +ILogSystem::BufferedCursor::BufferedCursor( std::vector>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore ) : messageVersion(begin), end(end), withTags(true), collectTags(false), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(false), knownUnique(true), minKnownCommittedVersion(0), randomID(deterministicRandom()->randomUniqueID()) { + targetQueueSize = SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES/logServers.size(); + messages.reserve(SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES); + cursorMessages.resize(logServers.size()); + for( int i = 0; i < logServers.size(); i++ ) { + Reference cursor( new ILogSystem::ServerPeekCursor( logServers[i], tag, begin, end, false, parallelGetMore ) ); + cursors.push_back( cursor ); + } } void ILogSystem::BufferedCursor::combineMessages() { @@ -910,7 +938,7 @@ void ILogSystem::BufferedCursor::combineMessages() { return; } - tags.clear(); + std::vector tags; tags.push_back(messages[messageIndex].tags[0]); for(int i = messageIndex + 1; i < messages.size() && messages[messageIndex].version == messages[i].version; i++) { tags.push_back(messages[i].tags[0]); @@ -919,14 +947,17 @@ void ILogSystem::BufferedCursor::combineMessages() { auto& msg = messages[messageIndex]; BinaryWriter messageWriter(Unversioned()); messageWriter << uint32_t(msg.message.size() + sizeof(uint32_t) + sizeof(uint16_t) + tags.size()*sizeof(Tag)) << msg.version.sub << uint16_t(tags.size()); - for(auto& t : tags) { + for(auto t : tags) { messageWriter << t; } messageWriter.serializeBytes(msg.message); Standalone val = messageWriter.toValue(); msg.arena = val.arena(); - msg.tags = tags; msg.message = val; + msg.tags = VectorRef(); + for(auto t : tags) { + msg.tags.push_back(msg.arena, t); + } } Reference ILogSystem::BufferedCursor::cloneNoMore() { @@ -973,7 +1004,7 @@ StringRef ILogSystem::BufferedCursor::getMessageWithTags() { return messages[messageIndex].message; } -const std::vector& ILogSystem::BufferedCursor::getTags() { +VectorRef 
ILogSystem::BufferedCursor::getTags() { ASSERT(withTags); return messages[messageIndex].tags; } @@ -982,24 +1013,25 @@ void ILogSystem::BufferedCursor::advanceTo(LogMessageVersion n) { ASSERT(false); } -ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, TaskPriority taskID ) { +ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, int idx, TaskPriority taskID ) { loop { wait(yield()); - if(cursor->version().version >= maxVersion) { + if(cursor->version().version >= self->end || self->cursorMessages[idx].size() > self->targetQueueSize) { return Void(); } - while(cursor->hasMessage()) { - self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? std::vector() : cursor->getTags(), cursor->version())); - cursor->nextMessage(); - if(cursor->version().version >= maxVersion) { - return Void(); - } - } wait(cursor->getMore(taskID)); self->poppedVersion = std::max(self->poppedVersion, cursor->popped()); + self->minKnownCommittedVersion = std::max(self->minKnownCommittedVersion, cursor->getMinKnownCommittedVersion()); if(self->canDiscardPopped) { self->initialPoppedVersion = std::max(self->initialPoppedVersion, cursor->popped()); } + if(cursor->version().version >= self->end) { + return Void(); + } + while(cursor->hasMessage()) { + self->cursorMessages[idx].push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? VectorRef() : cursor->getTags(), cursor->version())); + cursor->nextMessage(); + } } } @@ -1009,39 +1041,57 @@ ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriori throw internal_error(); } - state Version targetVersion = std::min(self->end, self->messageVersion.version + SERVER_KNOBS->VERSIONS_PER_BATCH); self->messages.clear(); std::vector> loaders; loaders.reserve(self->cursors.size()); - for(auto& cursor : self->cursors) { - loaders.push_back(bufferedGetMoreLoader(self, cursor, targetVersion, taskID)); - } - wait( waitForAll(loaders) ); - wait(yield()); - if(self->collectTags) { + for(int i = 0; i < self->cursors.size(); i++) { + loaders.push_back(bufferedGetMoreLoader(self, self->cursors[i], i, taskID)); + } + + state Future allLoaders = waitForAll(loaders); + state Version minVersion; + loop { + wait( allLoaders || delay(SERVER_KNOBS->DESIRED_GET_MORE_DELAY, taskID) ); + minVersion = self->end; + for(auto cursor : self->cursors) { + minVersion = std::min(minVersion, cursor->version().version); + } + if(minVersion > self->messageVersion.version) { + break; + } + if(allLoaders.isReady()) { + wait(Future(Never())); + } + } + wait( yield() ); + + for(auto &it : self->cursorMessages) { + while(!it.empty() && it.front().version.version < minVersion) { + self->messages.push_back(it.front()); + it.pop_front(); + } + } + if(self->collectTags || self->knownUnique) { std::sort(self->messages.begin(), self->messages.end()); } else { uniquify(self->messages); } + + self->messageVersion = LogMessageVersion(minVersion); self->messageIndex = 0; self->hasNextMessage = self->messages.size() > 0; - Version minVersion = self->end; - for(auto& cursor : self->cursors) { - minVersion = std::min(minVersion, cursor->version().version); - } - self->messageVersion = LogMessageVersion(minVersion); - + if(self->collectTags) { 
self->combineMessages(); } wait(yield()); if(self->canDiscardPopped && self->poppedVersion > self->version().version) { - TraceEvent(SevWarn, "DiscardingPoppedData").detail("Version", self->version().version).detail("Popped", self->poppedVersion); + TraceEvent(SevWarn, "DiscardingPoppedData", self->randomID).detail("Version", self->version().version).detail("Popped", self->poppedVersion); self->messageVersion = std::max(self->messageVersion, LogMessageVersion(self->poppedVersion)); - for(auto& cursor : self->cursors) { + for(auto cursor : self->cursors) { cursor->advanceTo(self->messageVersion); } self->messageIndex = self->messages.size(); @@ -1096,8 +1146,7 @@ const LogMessageVersion& ILogSystem::BufferedCursor::version() { } Version ILogSystem::BufferedCursor::getMinKnownCommittedVersion() { - ASSERT(false); - return invalidVersion; + return minKnownCommittedVersion; } Version ILogSystem::BufferedCursor::popped() { diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 3721b6c627..c573f33187 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -207,6 +207,7 @@ struct ProxyCommitData { uint64_t mostRecentProcessedRequestNumber; KeyRangeMap>> keyResolvers; KeyRangeMap keyInfo; + KeyRangeMap cacheInfo; std::map uid_applyMutationsData; bool firstProxy; double lastCoalesceTime; @@ -236,6 +237,7 @@ struct ProxyCommitData { Optional latencyBandConfig; double lastStartCommit; double lastCommitLatency; + int updateCommitRequests = 0; NotifiedDouble lastCommitTime; //The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly more CPU efficient. @@ -257,6 +259,16 @@ struct ProxyCommitData { return tags; } + const bool needsCacheTag(KeyRangeRef range) { + auto ranges = cacheInfo.intersectingRanges(range); + for(auto r : ranges) { + if(r.value()) { + return true; + } + } + return false; + } + ProxyCommitData(UID dbgid, MasterInterface master, RequestStream getConsistentReadVersion, Version recoveryTransactionVersion, RequestStream commit, Reference> db, bool firstProxy) : dbgid(dbgid), stats(dbgid, &version, &committedVersion, &commitBatchesMemBytesCount), master(master), logAdapter(NULL), txnStateStore(NULL), popRemoteTxs(false), @@ -657,7 +669,7 @@ ACTOR Future commitBatch( for (int resolver = 0; resolver < resolution.size(); resolver++) committed = committed && resolution[resolver].stateMutations[versionIndex][transactionIndex].committed; if (committed) - applyMetadataMutations( self->dbgid, arena, resolution[0].stateMutations[versionIndex][transactionIndex].mutations, self->txnStateStore, NULL, &forceRecovery, self->logSystem, 0, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped); + applyMetadataMutations( self->dbgid, arena, resolution[0].stateMutations[versionIndex][transactionIndex].mutations, self->txnStateStore, nullptr, &forceRecovery, self->logSystem, 0, &self->vecBackupKeys, &self->keyInfo, &self->cacheInfo, self->firstProxy ? 
&self->uid_applyMutationsData : nullptr, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped); if( resolution[0].stateMutations[versionIndex][transactionIndex].mutations.size() && firstStateMutations ) { ASSERT(committed); @@ -737,7 +749,7 @@ ACTOR Future commitBatch( { if (committed[t] == ConflictBatch::TransactionCommitted && (!locked || trs[t].isLockAware())) { commitCount++; - applyMetadataMutations(self->dbgid, arena, trs[t].transaction.mutations, self->txnStateStore, &toCommit, &forceRecovery, self->logSystem, commitVersion+1, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped); + applyMetadataMutations(self->dbgid, arena, trs[t].transaction.mutations, self->txnStateStore, &toCommit, &forceRecovery, self->logSystem, commitVersion+1, &self->vecBackupKeys, &self->keyInfo, &self->cacheInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped); } if(firstStateMutations) { ASSERT(committed[t] == ConflictBatch::TransactionCommitted); @@ -808,11 +820,16 @@ ACTOR Future commitBatch( if (debugMutation("ProxyCommit", commitVersion, m)) TraceEvent("ProxyCommitTo", self->dbgid).detail("To", describe(tags)).detail("Mutation", m.toString()).detail("Version", commitVersion); + toCommit.addTags(tags); + if(self->cacheInfo[m.param1]) { + toCommit.addTag(cacheTag); + } toCommit.addTypedMessage(m); } else if (m.type == MutationRef::ClearRange) { - auto ranges = self->keyInfo.intersectingRanges(KeyRangeRef(m.param1, m.param2)); + KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); + auto ranges = self->keyInfo.intersectingRanges(clearRange); auto firstRange = ranges.begin(); ++firstRange; if (firstRange == ranges.end()) { @@ -832,8 +849,12 @@ ACTOR Future commitBatch( } if (debugMutation("ProxyCommit", commitVersion, m)) TraceEvent("ProxyCommitTo", self->dbgid).detail("To", describe(allSources)).detail("Mutation", m.toString()).detail("Version", commitVersion); + toCommit.addTags(allSources); } + if(self->needsCacheTag(clearRange)) { + toCommit.addTag(cacheTag); + } toCommit.addTypedMessage(m); } else UNREACHABLE(); @@ -1052,7 +1073,9 @@ ACTOR Future commitBatch( ACTOR Future updateLastCommit(ProxyCommitData* self, Optional debugID = Optional()) { state double confirmStart = now(); self->lastStartCommit = confirmStart; + self->updateCommitRequests++; wait(self->logSystem->confirmEpochLive(debugID)); + self->updateCommitRequests--; self->lastCommitLatency = now()-confirmStart; self->lastCommitTime = std::max(self->lastCommitTime.get(), confirmStart); return Void(); @@ -1135,7 +1158,9 @@ ACTOR Future sendGrvReplies(Future replyFuture, std:: GetReadVersionReply reply = wait(replyFuture); double end = timer(); for(GetReadVersionRequest const& request : requests) { - stats->grvLatencyBands.addMeasurement(end - request.requestTime()); + if(request.priority() >= GetReadVersionRequest::PRIORITY_DEFAULT) { + stats->grvLatencyBands.addMeasurement(end - request.requestTime()); + } request.reply.send(reply); } @@ -1460,7 +1485,12 @@ ACTOR Future lastCommitUpdater(ProxyCommitData* self, PromiseStreamupdateCommitRequests < SERVER_KNOBS->MAX_COMMIT_UPDATES) { + addActor.send(updateLastCommit(self)); + } else { + TraceEvent(g_network->isSimulated() ? 
SevInfo : SevWarnAlways, "TooManyLastCommitUpdates").suppressFor(1.0); + self->lastStartCommit = now(); + } } } } @@ -1770,7 +1800,7 @@ ACTOR Future masterProxyServerCore( Arena arena; bool confChanges; - applyMetadataMutations(commitData.dbgid, arena, mutations, commitData.txnStateStore, NULL, &confChanges, Reference(), 0, &commitData.vecBackupKeys, &commitData.keyInfo, commitData.firstProxy ? &commitData.uid_applyMutationsData : NULL, commitData.commit, commitData.cx, &commitData.committedVersion, &commitData.storageCache, &commitData.tag_popped, true ); + applyMetadataMutations(commitData.dbgid, arena, mutations, commitData.txnStateStore, nullptr, &confChanges, Reference(), 0, &commitData.vecBackupKeys, &commitData.keyInfo, &commitData.cacheInfo, commitData.firstProxy ? &commitData.uid_applyMutationsData : nullptr, commitData.commit, commitData.cx, &commitData.committedVersion, &commitData.storageCache, &commitData.tag_popped, true ); } auto lockedKey = commitData.txnStateStore->readValue(databaseLockedKey).get(); diff --git a/fdbserver/MemoryPager.actor.cpp b/fdbserver/MemoryPager.actor.cpp deleted file mode 100644 index 52876ae397..0000000000 --- a/fdbserver/MemoryPager.actor.cpp +++ /dev/null @@ -1,456 +0,0 @@ -/* - * MemoryPager.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
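/* [Editor's sketch] The proxy-side cache tagging earlier in this diff (cacheInfo / needsCacheTag)
 * keeps a KeyRangeMap of cached key ranges; a mutation gets cacheTag added when its key, or any
 * range a clear touches, intersects a cached range. A simplified stand-in using an ordered map
 * from range-start key to a cached flag (flow's KeyRangeMap does the interval bookkeeping in the
 * real code; the map is assumed to cover the whole keyspace, starting with an entry at ""): */
#include <map>
#include <string>

using CacheInfo = std::map<std::string, bool>; // range [key, next key) -> cached?

bool needsCacheTag(const CacheInfo& cacheInfo, const std::string& begin, const std::string& end) {
	auto it = cacheInfo.upper_bound(begin);
	if (it != cacheInfo.begin()) --it; // last range starting at or before `begin`
	for (; it != cacheInfo.end() && it->first < end; ++it) {
		if (it->second) return true; // some range intersecting [begin, end) is cached
	}
	return false;
}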
- */ - -#include -#include "fdbserver/MemoryPager.h" -#include "fdbserver/Knobs.h" - -#include "flow/Arena.h" -#include "flow/UnitTest.h" -#include "flow/actorcompiler.h" - -typedef uint8_t* PhysicalPageID; -typedef std::vector> PageVersionMap; -typedef std::vector LogicalPageTable; - -class MemoryPager; - -class MemoryPage : public IPage, ReferenceCounted { -public: - MemoryPage(); - MemoryPage(uint8_t *data); - virtual ~MemoryPage(); - - virtual void addref() const { - ReferenceCounted::addref(); - } - - virtual void delref() const { - ReferenceCounted::delref(); - } - - virtual int size() const; - virtual uint8_t const* begin() const; - virtual uint8_t* mutate(); - -private: - friend class MemoryPager; - uint8_t *data; - bool allocated; - - static const int PAGE_BYTES; -}; - -class MemoryPagerSnapshot : public IPagerSnapshot, ReferenceCounted { -public: - MemoryPagerSnapshot(MemoryPager *pager, Version version) : pager(pager), version(version) {} - virtual Future> getPhysicalPage(LogicalPageID pageID); - virtual Version getVersion() const { - return version; - } - - virtual void addref() { - ReferenceCounted::addref(); - } - - virtual void delref() { - ReferenceCounted::delref(); - } - -private: - MemoryPager *pager; - Version version; -}; - -class MemoryPager : public IPager, ReferenceCounted { -public: - MemoryPager(); - - virtual Reference newPageBuffer(); - virtual int getUsablePageSize(); - - virtual Reference getReadSnapshot(Version version); - - virtual LogicalPageID allocateLogicalPage(); - virtual void freeLogicalPage(LogicalPageID pageID, Version version); - virtual void writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID); - virtual void forgetVersions(Version begin, Version end); - virtual Future commit(); - - virtual StorageBytes getStorageBytes() { - // TODO: Get actual values for used and free memory - return StorageBytes(); - } - - virtual void setLatestVersion(Version version); - virtual Future getLatestVersion(); - - virtual Future getError(); - virtual Future onClosed(); - virtual void dispose(); - virtual void close(); - - virtual Reference getPage(LogicalPageID pageID, Version version); - -private: - Version latestVersion; - Version committedVersion; - Standalone>> data; - LogicalPageTable pageTable; - - Promise closed; - - std::vector freeList; // TODO: is this good enough for now? 
- - PhysicalPageID allocatePage(Reference contents); - void extendData(); - - static const PhysicalPageID INVALID_PAGE; -}; - -IPager * createMemoryPager() { - return new MemoryPager(); -} - -MemoryPage::MemoryPage() : allocated(true) { - data = (uint8_t*)FastAllocator<4096>::allocate(); -} - -MemoryPage::MemoryPage(uint8_t *data) : data(data), allocated(false) {} - -MemoryPage::~MemoryPage() { - if(allocated) { - FastAllocator<4096>::release(data); - } -} - -uint8_t const* MemoryPage::begin() const { - return data; -} - -uint8_t* MemoryPage::mutate() { - return data; -} - -int MemoryPage::size() const { - return PAGE_BYTES; -} - -const int MemoryPage::PAGE_BYTES = 4096; - -Future> MemoryPagerSnapshot::getPhysicalPage(LogicalPageID pageID) { - return pager->getPage(pageID, version); -} - -MemoryPager::MemoryPager() : latestVersion(0), committedVersion(0) { - extendData(); - pageTable.resize(SERVER_KNOBS->PAGER_RESERVED_PAGES); -} - -Reference MemoryPager::newPageBuffer() { - return Reference(new MemoryPage()); -} - -int MemoryPager::getUsablePageSize() { - return MemoryPage::PAGE_BYTES; -} - -Reference MemoryPager::getReadSnapshot(Version version) { - ASSERT(version <= latestVersion); - return Reference(new MemoryPagerSnapshot(this, version)); -} - -LogicalPageID MemoryPager::allocateLogicalPage() { - ASSERT(pageTable.size() >= SERVER_KNOBS->PAGER_RESERVED_PAGES); - pageTable.push_back(PageVersionMap()); - return pageTable.size() - 1; -} - -void MemoryPager::freeLogicalPage(LogicalPageID pageID, Version version) { - ASSERT(pageID < pageTable.size()); - - PageVersionMap &pageVersionMap = pageTable[pageID]; - ASSERT(!pageVersionMap.empty()); - - auto itr = std::lower_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](std::pair p, Version v) { - return p.first < v; - }); - - pageVersionMap.erase(itr, pageVersionMap.end()); - if(pageVersionMap.size() > 0 && pageVersionMap.back().second != INVALID_PAGE) { - pageVersionMap.push_back(std::make_pair(version, INVALID_PAGE)); - } -} - -void MemoryPager::writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID) { - ASSERT(updateVersion > latestVersion || updateVersion == 0); - ASSERT(pageID < pageTable.size()); - - if(referencePageID != invalidLogicalPageID) { - PageVersionMap &rpv = pageTable[referencePageID]; - ASSERT(!rpv.empty()); - updateVersion = rpv.back().first; - } - - PageVersionMap &pageVersionMap = pageTable[pageID]; - - ASSERT(updateVersion >= committedVersion || updateVersion == 0); - PhysicalPageID physicalPageID = allocatePage(contents); - - ASSERT(pageVersionMap.empty() || pageVersionMap.back().second != INVALID_PAGE); - - if(updateVersion == 0) { - ASSERT(pageVersionMap.size()); - updateVersion = pageVersionMap.back().first; - pageVersionMap.back().second = physicalPageID; - // TODO: what to do with old page? 
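/* [Editor's sketch] The core lookup of the MemoryPager being deleted above: each logical page
 * maps to a version-sorted vector of (version, physical page) pairs, and a read at version V
 * resolves to the last entry written at or before V. The same lookup, reduced to standard
 * containers: */
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Version = int64_t;
using PhysicalPage = const void*; // stand-in for the pager's physical page pointer

// Returns nullptr when the page had no write at or before `version`.
PhysicalPage pageAsOf(const std::vector<std::pair<Version, PhysicalPage>>& pageVersions,
                      Version version) {
	auto itr = std::upper_bound(
	    pageVersions.begin(), pageVersions.end(), version,
	    [](Version v, const std::pair<Version, PhysicalPage>& p) { return v < p.first; });
	if (itr == pageVersions.begin()) return nullptr; // page did not exist yet
	--itr;
	return itr->second; // in the deleted code this can be INVALID_PAGE if freed at that version
}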
- } - else { - ASSERT(pageVersionMap.empty() || pageVersionMap.back().first < updateVersion); - pageVersionMap.push_back(std::make_pair(updateVersion, physicalPageID)); - } - -} - -void MemoryPager::forgetVersions(Version begin, Version end) { - ASSERT(begin <= end); - ASSERT(end <= latestVersion); - // TODO -} - -Future MemoryPager::commit() { - ASSERT(committedVersion < latestVersion); - committedVersion = latestVersion; - return Void(); -} - -void MemoryPager::setLatestVersion(Version version) { - ASSERT(version > latestVersion); - latestVersion = version; -} - -Future MemoryPager::getLatestVersion() { - return latestVersion; -} - -Reference MemoryPager::getPage(LogicalPageID pageID, Version version) { - ASSERT(pageID < pageTable.size()); - PageVersionMap const& pageVersionMap = pageTable[pageID]; - - auto itr = std::upper_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](Version v, std::pair p) { - return v < p.first; - }); - - if(itr == pageVersionMap.begin()) { - return Reference(); // TODO: should this be an error? - } - - --itr; - - ASSERT(itr->second != INVALID_PAGE); - return Reference(new MemoryPage(itr->second)); // TODO: Page memory owned by the pager. Change this? -} - -Future MemoryPager::getError() { - return Void(); -} - -Future MemoryPager::onClosed() { - return closed.getFuture(); -} - -void MemoryPager::dispose() { - closed.send(Void()); - delete this; -} - -void MemoryPager::close() { - dispose(); -} - -PhysicalPageID MemoryPager::allocatePage(Reference contents) { - if(freeList.size()) { - PhysicalPageID pageID = freeList.back(); - freeList.pop_back(); - - memcpy(pageID, contents->begin(), contents->size()); - return pageID; - } - else { - ASSERT(data.size() && data.back().capacity() - data.back().size() >= contents->size()); - PhysicalPageID pageID = data.back().end(); - - data.back().append(data.arena(), contents->begin(), contents->size()); - if(data.back().size() == data.back().capacity()) { - extendData(); - } - else { - ASSERT(data.back().size() <= data.back().capacity() - 4096); - } - - return pageID; - } -} - -void MemoryPager::extendData() { - if(data.size() > 1000) { // TODO: is this an ok way to handle large data size? - throw io_error(); - } - - VectorRef d; - d.reserve(data.arena(), 1 << 22); - data.push_back(data.arena(), d); -} - -// TODO: these tests are not MemoryPager specific, we should make them more general - -void fillPage(Reference page, LogicalPageID pageID, Version version) { - ASSERT(page->size() > sizeof(LogicalPageID) + sizeof(Version)); - - memset(page->mutate(), 0, page->size()); - memcpy(page->mutate(), (void*)&pageID, sizeof(LogicalPageID)); - memcpy(page->mutate() + sizeof(LogicalPageID), (void*)&version, sizeof(Version)); -} - -bool validatePage(Reference page, LogicalPageID pageID, Version version) { - bool valid = true; - - LogicalPageID readPageID = *(LogicalPageID*)page->begin(); - if(readPageID != pageID) { - fprintf(stderr, "Invalid PageID detected: %u (expected %u)\n", readPageID, pageID); - valid = false; - } - - Version readVersion = *(Version*)(page->begin()+sizeof(LogicalPageID)); - if(readVersion != version) { - fprintf(stderr, "Invalid Version detected on page %u: %" PRId64 "(expected %" PRId64 ")\n", pageID, readVersion, version); - valid = false; - } - - return valid; -} - -void writePage(IPager *pager, Reference page, LogicalPageID pageID, Version version, bool updateVersion=true) { - fillPage(page, pageID, version); - pager->writePage(pageID, page, updateVersion ? 
version : 0); -} - -ACTOR Future commit(IPager *pager) { - static int commitNum = 1; - state int myCommit = commitNum++; - - debug_printf("Commit%d\n", myCommit); - wait(pager->commit()); - debug_printf("FinishedCommit%d\n", myCommit); - return Void(); -} - -ACTOR Future read(IPager *pager, LogicalPageID pageID, Version version, Version expectedVersion=-1) { - static int readNum = 1; - state int myRead = readNum++; - state Reference readSnapshot = pager->getReadSnapshot(version); - debug_printf("Read%d\n", myRead); - Reference readPage = wait(readSnapshot->getPhysicalPage(pageID)); - debug_printf("FinishedRead%d\n", myRead); - ASSERT(validatePage(readPage, pageID, expectedVersion >= 0 ? expectedVersion : version)); - return Void(); -} - -ACTOR Future simplePagerTest(IPager *pager) { - state Reference page = pager->newPageBuffer(); - - Version latestVersion = wait(pager->getLatestVersion()); - debug_printf("Got latest version: %lld\n", latestVersion); - - state Version version = latestVersion+1; - state Version v1 = version; - - state LogicalPageID pageID1 = pager->allocateLogicalPage(); - - writePage(pager, page, pageID1, v1); - pager->setLatestVersion(v1); - wait(commit(pager)); - - state LogicalPageID pageID2 = pager->allocateLogicalPage(); - - state Version v2 = ++version; - - writePage(pager, page, pageID1, v2); - writePage(pager, page, pageID2, v2); - pager->setLatestVersion(v2); - wait(commit(pager)); - - wait(read(pager, pageID1, v2)); - wait(read(pager, pageID1, v1)); - - state Version v3 = ++version; - writePage(pager, page, pageID1, v3, false); - pager->setLatestVersion(v3); - - wait(read(pager, pageID1, v2, v3)); - wait(read(pager, pageID1, v3, v3)); - - state LogicalPageID pageID3 = pager->allocateLogicalPage(); - - state Version v4 = ++version; - writePage(pager, page, pageID2, v4); - writePage(pager, page, pageID3, v4); - pager->setLatestVersion(v4); - wait(commit(pager)); - - wait(read(pager, pageID2, v4, v4)); - - state Version v5 = ++version; - writePage(pager, page, pageID2, v5); - - state LogicalPageID pageID4 = pager->allocateLogicalPage(); - writePage(pager, page, pageID4, v5); - - state Version v6 = ++version; - pager->freeLogicalPage(pageID2, v5); - pager->freeLogicalPage(pageID3, v3); - pager->setLatestVersion(v6); - wait(commit(pager)); - - pager->forgetVersions(0, v4); - wait(commit(pager)); - - wait(delay(3.0)); - - wait(commit(pager)); - - return Void(); -} - -/* -TEST_CASE("/fdbserver/memorypager/simple") { - state IPager *pager = new MemoryPager(); - - wait(simplePagerTest(pager)); - - Future closedFuture = pager->onClosed(); - pager->dispose(); - - wait(closedFuture); - return Void(); -} -*/ - -const PhysicalPageID MemoryPager::INVALID_PAGE = nullptr; diff --git a/fdbserver/MemoryPager.h b/fdbserver/MemoryPager.h deleted file mode 100644 index 359c443de7..0000000000 --- a/fdbserver/MemoryPager.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * MemoryPager.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef FDBSERVER_MEMORYPAGER_H -#define FDBSERVER_MEMORYPAGER_H -#pragma once - -#include "fdbserver/IPager.h" - -IPager * createMemoryPager(); - -#endif \ No newline at end of file diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index c07f820f3e..0e02cd57b6 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -875,6 +875,9 @@ namespace oldTLog_4_6 { try { peekId = req.sequence.get().first; sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw timed_out(); + } if(sequence > 0) { auto& trackerData = self->peekTracker[peekId]; trackerData.lastUpdate = now(); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index d6c5ca1c4e..eb1b5b9dd3 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -284,6 +284,7 @@ struct TLogData : NonCopyable { std::map toBePopped; // map of Tag->Version for all the pops // that came when ignorePopRequest was set Reference> degraded; + std::vector tempTagMessages; TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference> dbInfo, Reference> degraded, std::string folder) : dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()), @@ -677,6 +678,80 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD return Void(); } +ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { + if (self->ignorePopRequest) { + TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); + + if (self->toBePopped.find(inputTag) == self->toBePopped.end() + || to > self->toBePopped[inputTag]) { + self->toBePopped[inputTag] = to; + } + // add the pop to the toBePopped map + TraceEvent(SevDebug, "IgnoringPopRequest") + .detail("IgnorePopDeadline", self->ignorePopDeadline) + .detail("Tag", inputTag.toString()) + .detail("Version", to); + return Void(); + } + state Version upTo = to; + int8_t tagLocality = inputTag.locality; + if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) { + upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to); + tagLocality = tagLocalityLogRouter; + } + state Tag tag(tagLocality, inputTag.id); + auto tagData = logData->getTagData(tag); + if (!tagData) { + tagData = logData->createTagData(tag, upTo, true, true, false); + } else if (upTo > tagData->popped) { + tagData->popped = upTo; + tagData->poppedRecently = true; + + if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) { + tagData->unpoppedRecovered = false; + logData->unpoppedRecoveredTags--; + TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt); + if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) { + logData->recoveryComplete.send(Void()); + } + } + + if (upTo > logData->persistentDataDurableVersion) + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); + //TraceEvent("TLogPop", self->dbgid).detail("Tag", 
tag.toString()).detail("To", upTo); + } + return Void(); +} + +ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) { + // timeout check for ignorePopRequest + if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) { + + TraceEvent("EnableTLogPlayAllIgnoredPops"); + // use toBePopped and issue all the pops + state std::map<Tag, Version>::iterator it; + state vector<Future<Void>> ignoredPops; + self->ignorePopRequest = false; + self->ignorePopUid = ""; + self->ignorePopDeadline = 0.0; + for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) { + TraceEvent("PlayIgnoredPop") + .detail("Tag", it->first.toString()) + .detail("Version", it->second); + ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData)); + } + self->toBePopped.clear(); + wait(waitForAll(ignoredPops)); + TraceEvent("ResetIgnorePopRequest") + .detail("Now", g_network->now()) + .detail("IgnorePopRequest", self->ignorePopRequest) + .detail("IgnorePopDeadline", self->ignorePopDeadline); + } + wait(tLogPopCore(self, req.tag, req.to, logData)); + req.reply.send(Void()); + return Void(); +} + // This function and updatePersistentData (which it calls) run at a low priority and can soak up all CPU resources. // For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important // work (e.g. commits). @@ -696,6 +771,26 @@ ACTOR Future updateStorage( TLogData* self ) { state FlowLock::Releaser commitLockReleaser; + //FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries. + // It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag, which is not intended to ever happen.
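/* [Editor's sketch] The deferred-pop behavior above in isolation: while ignorePopRequest is set,
 * tLogPopCore only records the furthest requested version per tag in toBePopped, and once the
 * deadline passes tLogPop replays the coalesced pops. Tag is reduced to an int and the
 * parallel-actor replay to a plain loop for brevity: */
#include <cstdint>
#include <map>

using TagId = int;
using Version = int64_t;

// While pops are ignored, remember only the maximum pop version per tag.
void deferPop(std::map<TagId, Version>& toBePopped, TagId tag, Version to) {
	auto it = toBePopped.find(tag);
	if (it == toBePopped.end() || to > it->second) toBePopped[tag] = to;
}

// Once re-enabled, pop each tag exactly once, to its furthest deferred version.
template <class PopFn>
void replayDeferredPops(std::map<TagId, Version>& toBePopped, PopFn popCore) {
	for (const auto& [tag, version] : toBePopped) popCore(tag, version);
	toBePopped.clear();
}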
+ Optional cachePopVersion; + for(auto& it : self->id_data) { + if(!it.second->stopped) { + if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) { + cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS; + } + break; + } + } + + if(cachePopVersion.present()) { + state std::vector> cachePopFutures; + for(auto& it : self->id_data) { + cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second)); + } + wait( waitForAll(cachePopFutures) ); + } + if(logData->stopped) { if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) { while(logData->persistentDataDurableVersion != logData->version.get()) { @@ -886,13 +981,13 @@ void commitMessages( TLogData* self, Reference logData, Version version void commitMessages( TLogData *self, Reference logData, Version version, Arena arena, StringRef messages ) { ArenaReader rd( arena, messages, Unversioned() ); - std::vector msgs; + self->tempTagMessages.clear(); while(!rd.empty()) { TagsAndMessage tagsAndMsg; tagsAndMsg.loadFromArena(&rd, nullptr); - msgs.push_back(std::move(tagsAndMsg)); + self->tempTagMessages.push_back(std::move(tagsAndMsg)); } - commitMessages(self, logData, version, msgs); + commitMessages(self, logData, version, self->tempTagMessages); } Version poppedVersion( Reference self, Tag tag) { @@ -915,80 +1010,6 @@ std::deque> & getVersionMessages( Re return tagData->versionMessages; }; -ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { - if (self->ignorePopRequest) { - TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); - - if (self->toBePopped.find(inputTag) == self->toBePopped.end() - || to > self->toBePopped[inputTag]) { - self->toBePopped[inputTag] = to; - } - // add the pop to the toBePopped map - TraceEvent(SevDebug, "IgnoringPopRequest") - .detail("IgnorePopDeadline", self->ignorePopDeadline) - .detail("Tag", inputTag.toString()) - .detail("Version", to); - return Void(); - } - state Version upTo = to; - int8_t tagLocality = inputTag.locality; - if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) { - upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to); - tagLocality = tagLocalityLogRouter; - } - state Tag tag(tagLocality, inputTag.id); - auto tagData = logData->getTagData(tag); - if (!tagData) { - tagData = logData->createTagData(tag, upTo, true, true, false); - } else if (upTo > tagData->popped) { - tagData->popped = upTo; - tagData->poppedRecently = true; - - if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) { - tagData->unpoppedRecovered = false; - logData->unpoppedRecoveredTags--; - TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt); - if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) { - logData->recoveryComplete.send(Void()); - } - } - - if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); - //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); - } - return Void(); -} - -ACTOR Future tLogPop( TLogData* self, TLogPopRequest req, 
Reference logData ) { - // timeout check for ignorePopRequest - if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) { - - TraceEvent("EnableTLogPlayAllIgnoredPops"); - // use toBePopped and issue all the pops - state std::map::iterator it; - state vector> ignoredPops; - self->ignorePopRequest = false; - self->ignorePopUid = ""; - self->ignorePopDeadline = 0.0; - for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) { - TraceEvent("PlayIgnoredPop") - .detail("Tag", it->first.toString()) - .detail("Version", it->second); - ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData)); - } - self->toBePopped.clear(); - wait(waitForAll(ignoredPops)); - TraceEvent("ResetIgnorePopRequest") - .detail("Now", g_network->now()) - .detail("IgnorePopRequest", self->ignorePopRequest) - .detail("IgnorePopDeadline", self->ignorePopDeadline); - } - wait(tLogPopCore(self, req.tag, req.to, logData)); - req.reply.send(Void()); - return Void(); -} - void peekMessagesFromMemory( Reference self, TLogPeekRequest const& req, BinaryWriter& messages, Version& endVersion ) { ASSERT( !messages.getLength() ); @@ -1025,6 +1046,9 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere try { peekId = req.sequence.get().first; sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw timed_out(); + } auto& trackerData = logData->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); @@ -1226,6 +1250,7 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData, st self->queueCommitBegin = commitNumber; logData->queueCommittingVersion = ver; + g_network->setCurrentTask(TaskPriority::TLogCommitReply); Future c = self->persistentQueue->commit(); self->diskQueueCommitBytes = 0; self->largeDiskQueueCommitBytes.set(false); @@ -1716,7 +1741,7 @@ void removeLog( TLogData* self, Reference logData ) { } } -ACTOR Future pullAsyncData( TLogData* self, Reference logData, std::vector tags, Version beginVersion, Optional endVersion, bool poppedIsKnownCommitted, bool parallelGetMore ) { +ACTOR Future pullAsyncData( TLogData* self, Reference logData, std::vector tags, Version beginVersion, Optional endVersion, bool poppedIsKnownCommitted ) { state Future dbInfoChange = Void(); state Reference r; state Version tagAt = beginVersion; @@ -1730,7 +1755,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st } when( wait( dbInfoChange ) ) { if( logData->logSystem->get() ) { - r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, parallelGetMore ); + r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, true ); } else { r = Reference(); } @@ -1867,7 +1892,7 @@ ACTOR Future tLogCore( TLogData* self, Reference logData, TLogInt if(!logData->isPrimary) { std::vector tags; tags.push_back(logData->remoteTag); - logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional(), true, true) ); + logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? 
logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional(), true) ); } try { @@ -2230,10 +2255,10 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit logData->logRouterPopToVersion = req.recoverAt; std::vector tags; tags.push_back(logData->remoteTag); - wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true, false) || logData->removed); + wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true) || logData->removed); } else if(!req.recoverTags.empty()) { ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion); - wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false, true) || logData->removed); + wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false) || logData->removed); } pulledRecoveryVersions = true; logData->knownCommittedVersion = req.recoverAt; @@ -2331,6 +2356,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ self.sharedActors.send( commitQueue(&self) ); self.sharedActors.send( updateStorageLoop(&self) ); + state Future<Void> activeSharedChange = Void(); loop { choose { @@ -2343,12 +2369,13 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } when ( wait( error ) ) { throw internal_error(); } - when ( wait( activeSharedTLog->onChange() ) ) { + when ( wait( activeSharedChange ) ) { if (activeSharedTLog->get() == tlogId) { self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; } else { self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); } + activeSharedChange = activeSharedTLog->onChange(); } } } diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 567502cfcb..d42f0a4d52 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -943,6 +943,8 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD return Void(); } +ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ); + // This function and updatePersistentData (which it calls) run at a low priority and can soak up all CPU resources. // For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important // work (e.g. commits). @@ -962,6 +964,26 @@ ACTOR Future updateStorage( TLogData* self ) { state FlowLock::Releaser commitLockReleaser; + //FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries. + // It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag, which is not intended to ever happen.
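/* [Editor's sketch] The cache-pop threshold above with concrete numbers, using
 * MAX_CACHE_VERSIONS = 10e6 as initialized earlier in this diff and assuming
 * MAX_VERSIONS_IN_FLIGHT = 100e6 for illustration: a log at version 250e6 with
 * unrecoveredBefore = 0 exceeds the 110e6 threshold, so cacheTag is popped up to
 * 250e6 - 10e6 = 240e6, keeping roughly the newest MAX_CACHE_VERSIONS of cached data. */
#include <cstdint>
#include <optional>

using Version = int64_t;

std::optional<Version> cachePopVersion(Version current, Version unrecoveredBefore,
                                       Version maxVersionsInFlight, Version maxCacheVersions) {
	if (current - unrecoveredBefore > maxVersionsInFlight + maxCacheVersions)
		return current - maxCacheVersions; // pop everything older than the cache window
	return std::nullopt;                   // below threshold: nothing to pop yet
}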
+ Optional cachePopVersion; + for(auto& it : self->id_data) { + if(!it.second->stopped) { + if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) { + cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS; + } + break; + } + } + + if(cachePopVersion.present()) { + state std::vector> cachePopFutures; + for(auto& it : self->id_data) { + cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second)); + } + wait( waitForAll(cachePopFutures) ); + } + if(logData->stopped) { if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) { while(logData->persistentDataDurableVersion != logData->version.get()) { diff --git a/fdbserver/PrefixTree.h b/fdbserver/PrefixTree.h deleted file mode 100644 index 2f67c20ccd..0000000000 --- a/fdbserver/PrefixTree.h +++ /dev/null @@ -1,1049 +0,0 @@ -/* - * PrefixTree.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "flow/flow.h" -#include "flow/Arena.h" -#include "fdbclient/FDBTypes.h" -#include "fdbserver/Knobs.h" -#include - -typedef uint64_t Word; -static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int cl) { - int i = 0; - const int wordEnd = cl - sizeof(Word) + 1; - - for(; i < wordEnd; i += sizeof(Word)) { - Word a = *(Word *)ap; - Word b = *(Word *)bp; - if(a != b) { - return i + ctzll(a ^ b) / 8; - } - ap += sizeof(Word); - bp += sizeof(Word); - } - - for (; i < cl; i++) { - if (*ap != *bp) { - return i; - } - ++ap; - ++bp; - } - return cl; -} - -static int commonPrefixLength(StringRef a, StringRef b) { - return commonPrefixLength(a.begin(), b.begin(), std::min(a.size(), b.size())); -} - -// This appears to be the fastest version -static int lessOrEqualPowerOfTwo(int n) { - int p; - for (p = 1; p+p <= n; p+=p); - return p; -} - -/* -static int _lessOrEqualPowerOfTwo(uint32_t n) { - if(n == 0) - return n; - int trailing = __builtin_ctz(n); - int leading = __builtin_clz(n); - if(trailing + leading == ((sizeof(n) * 8) - 1)) - return n; - return 1 << ( (sizeof(n) * 8) - leading - 1); -} - -static int __lessOrEqualPowerOfTwo(unsigned int n) { - int p = 1; - for(; p <= n; p <<= 1); - return p >> 1; -} -*/ - -static int perfectSubtreeSplitPoint(int subtree_size) { - // return the inorder index of the root node in a subtree of the given size - // consistent with the resulting binary search tree being "perfect" (having minimal height - // and all missing nodes as far right as possible). - // There has to be a simpler way to do this. 
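[Editor's note] For reference, a standalone check of the split-point arithmetic in the function being deleted here, using reference reimplementations (illustrative only, not part of the patch):

    #include <algorithm>
    #include <cassert>

    static int lessOrEqualPowerOfTwoRef(int n) {
        int p = 1;
        while (p + p <= n) p += p; // largest power of two <= n
        return p;
    }

    static int splitPointRef(int subtreeSize) {
        int s = lessOrEqualPowerOfTwoRef((subtreeSize - 1) / 2 + 1) - 1;
        return std::min(s * 2 + 1, subtreeSize - s - 1);
    }

    int main() {
        assert(splitPointRef(1) == 0); // a single entry is the root
        assert(splitPointRef(7) == 3); // perfect tree: entries [0..2] left, 3 root, [4..6] right
        assert(splitPointRef(6) == 3); // the missing node falls as far right as possible
    }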
- int s = lessOrEqualPowerOfTwo((subtree_size - 1) / 2 + 1) - 1; - return std::min(s * 2 + 1, subtree_size - s - 1); -} - -static int perfectSubtreeSplitPointCached(int subtree_size) { - static uint16_t *points = nullptr; - static const int max = 500; - if(points == nullptr) { - points = new uint16_t[max]; - for(int i = 0; i < max; ++i) - points[i] = perfectSubtreeSplitPoint(i); - } - - if(subtree_size < max) - return points[subtree_size]; - return perfectSubtreeSplitPoint(subtree_size); -} - -struct PrefixTree { - // TODO: Make PrefixTree use a more complex record type with a multi-column key - typedef KeyValueRef EntryRef; - typedef Standalone Entry; - - static int MaximumTreeSize() { - return std::numeric_limits::max(); - }; - - struct Node { - uint8_t flags; - -/* - * Node fields - * - * Logically, a node has the following things - * - Flags describing what is in the node - * - Optional left child - * - Optional right child - * - Prefix string, described by a length and a source (which is the most recent left or right ancestor) - * - Optional split string, which contains any bytes after prefix which are needed to make a branching decision - * - Optional suffix string, containing any remaining key bytes after the split string - * - Optional value string - * - * The physical layout places the left child subtree immediately after the split string so that it is likely - * that the bytes read to make a branching decision and then choosing left (as should happen half of the time) - * will have a high cache hit rate. - * - * If necessary, the flags byte could be an enumeration into a set of possible options, since not all option - * combinations are needed. For example, - * - * - The tree is balanced and filled from the left at the last level, so a node cannot have only a right child. - * - If there are no children, there is no point in splitting any key bytes after the prefix into separate strings. - * - If there is exactly one child (left) then the key bytes after the prefix can all go in the split string. The - * traversal decision is to either stop or go left and one of those options (stop) will still have good memory - * locality. - * - * 8 valid/necessary option combinations for presence of (Left, Right, Split, Suffix) out of 16 possibilities - * - * L R Split Suffix - * - * N N N N # No children, key has no bytes after prefix - * N N Y N # No children, key has bytes after prefix - * Y N N N # One child, key has no bytes after prefix - * Y N Y N # One child, key has bytes after prefix - * Y Y N N # Two children, key has no bytes after prefix - * Y Y N Y # Two children, branch decision can be made using only prefix bytes but there are more key bytes after - * Y Y Y N # Two children, branch decision requires all key bytes after prefix - * Y Y Y Y # Two children, branch decision requires some but not all bytes after prefix - * - * This can be represented with just 3 bits, if necessary, but for now there is space in the flags byte for all 4.
- * - * Flag Bits - * - * prefix borrow from next - * true - borrow from the closest ancestor greater than this node - * false - borrow from the closest ancestor less than this node - * large lengths = use 2 byte ints instead of 1 byte for prefix, split, suffix, and value lengths - * (TODO: It might be better to just not use a suffix at all when large lengths is set) - * left child present - * right child present - * split string present - * suffix string present - * value string present - * - * Serialized format: - * All lengths are in the header, which has variable size - * - * flags 1 byte - * prefix length 1-2 bytes based on large lengths flag - * split length 0-2 bytes based on split string present flag - * suffix length 0-2 bytes based on suffix string present and large lengths flags - * value length 0-1 bytes based on value string present and large lengths flag - * left length 0 or 2 bytes depending on left child present - * split 0+ bytes - * left child 0+ bytes - * suffix 0+ bytes - * value 0+ bytes - * right child 0+ bytes - * - */ - enum EFlags { - USE_LARGE_LENGTHS = 1 << 0, - PREFIX_SOURCE_NEXT = 1 << 1, - HAS_LEFT_CHILD = 1 << 2, - HAS_RIGHT_CHILD = 1 << 3, - HAS_SPLIT = 1 << 4, - HAS_SUFFIX = 1 << 5, - HAS_VALUE = 1 << 6 - }; - - // Stores decoded offsets (from beginning) of Node components - struct Parser { - Parser() {} - Parser(const Node *n) { - init(n); - } - - const Node *node; - - typedef uint16_t OffsetT; - OffsetT headerLen; - OffsetT prefixLen; - OffsetT leftPos; - OffsetT suffixPos; - OffsetT valuePos; - OffsetT rightPos; - - StringRef splitString() const { - return StringRef((const uint8_t *)node + headerLen, leftPos); - } - StringRef suffixString() const { - return StringRef((const uint8_t *)node + headerLen + suffixPos, valuePos - suffixPos); - } - StringRef valueString() const { - return StringRef((const uint8_t *)node + headerLen + valuePos, rightPos - valuePos); - } - const Node *leftChild() const { - if(node->flags & HAS_LEFT_CHILD) - return (const Node *)((const uint8_t *)node + headerLen + leftPos); - return nullptr; - } - const Node *rightChild() const { - if(node->flags & HAS_RIGHT_CHILD) - return (const Node *)((const uint8_t *)node + headerLen + rightPos); - return nullptr; - } - int keyLen() const { - int len = prefixLen + leftPos + (valuePos - suffixPos); - ASSERT(len >= 0); - return len; - } - - void init(const Node *n) { - node = n; - union { - const uint8_t *p8; - const uint16_t *p16; - }; - p8 = (const uint8_t *)&n->flags + 1; - - int flags = n->flags; - bool large = flags & USE_LARGE_LENGTHS; - - prefixLen = large ? *p16++ : *p8++; - - if(flags & HAS_SPLIT) - leftPos = large ? *p16++ : *p8++; - else - leftPos = 0; - suffixPos = leftPos; - if(flags & HAS_LEFT_CHILD) - suffixPos += *p16++; - - valuePos = suffixPos; - if(flags & HAS_SUFFIX) - valuePos += (large ? *p16++ : *p8++); - - rightPos = valuePos; - if(flags & HAS_VALUE) - rightPos += (large ? *p16++ : *p8++); - - int header = 2; // flags byte, first prefix len byte - if(large) - ++header; // second prefix len byte - if(flags & HAS_SPLIT) - header += large ? 2 : 1; - if(flags & HAS_LEFT_CHILD) - header += 2; - if(flags & HAS_SUFFIX) - header += large ? 2 : 1; - if(flags & HAS_VALUE) - header += large ? 2 : 1; - headerLen = header; - } - }; - - static inline int getMaxOverhead(int index, int keySize, int valueSize) { - bool large = keySize > 255 || valueSize > 255; - int overhead = 1 + (large ?
2 : 1); // flags and prefix len - // Value length size if present - if(valueSize > 0) - overhead += large ? 2 : 1; - overhead += large ? 6 : 3; // Worst case scenario for value, split and suffix lengths - if((index & 0x01) != 0) - overhead += 2; // Left child length, one less than half of nodes will have one. - return overhead; - } - - public: - - // Methods for decoding specific Node members on-demand - inline int getPrefixLen() const { - return Parser(this).prefixLen; - } - - inline StringRef getSplitString() const { - return Parser(this).splitString(); - } - - inline StringRef getSuffixString() const { - return Parser(this).suffixString(); - } - - inline StringRef getValueString() const { - return Parser(this).valueString(); - } - - inline const Node * getLeftChild() const { - return Parser(this).leftChild(); - } - - inline const Node * getRightChild() const { - return Parser(this).rightChild(); - } - - inline int getKeySize() const { - return Parser(this).keyLen(); - } - }; - -#pragma pack(push,1) - uint16_t size; // size in bytes - Node root; -#pragma pack(pop) - - static inline int GetHeaderSize() { - return sizeof(PrefixTree) - sizeof(root); - } - -private: - struct PathEntry { - const Node *node; - Node::Parser parser; - - // Key may or may not point to the space within keyBuffer. - // Key will always contain at least the prefix bytes borrowed by node - // KeyBuffer will always be large enough to hold the entire reconstituted key for node - // - // These are mutable because getting key bytes from this PathEntry can change these - // but they're really just a read cache for reconstituted key bytes. - mutable StringRef key; - mutable Standalone> keyBuffer; - - // Path entry was reached by going left from the previous node - bool nodeIsLeftChild; - // number of consecutive moves in same direction - int moves; - - PathEntry() : node(nullptr) { - } - PathEntry(const PathEntry &rhs) { - *this = rhs; - } - - // Initialize the key byte buffer to hold bytes of a new node. Use a new arena - // if the old arena is being held by any users. - void initKeyBufferSpace() { - if(node != nullptr) { - int size = parser.keyLen(); - if(keyBuffer.arena().impl && !keyBuffer.arena().impl->isSoleOwnerUnsafe()) { - keyBuffer = Standalone>(); - } - keyBuffer.reserve(keyBuffer.arena(), size); - } - } - - PathEntry & operator= (const PathEntry &rhs) { - node = rhs.node; - parser = rhs.parser; - nodeIsLeftChild = rhs.nodeIsLeftChild; - moves = rhs.moves; - // New key buffer must be able to hold full reconstituted key, not just the - // part of it referenced by rhs.key (which may not be the whole thing) - initKeyBufferSpace(); - if(node != nullptr && rhs.key.size() > 0) { - // Copy rhs.key into keyBuffer and set key to the destination bytes - memcpy(keyBuffer.begin(), rhs.key.begin(), rhs.key.size()); - key = StringRef(keyBuffer.begin(), rhs.key.size()); - } - else { - key = rhs.key; - } - return *this; - } - - void init(StringRef s) { - node = nullptr; - key = s; - } - - void init(const Node *_node, const PathEntry *prefixSource, bool isLeft, int numMoves) { - node = _node; - parser.init(node); - nodeIsLeftChild = isLeft; - moves = numMoves; - - // keyBuffer will be large enough to hold the full reconstituted key but initially - // key will be a reference returned from prefixSource->getKeyRef() - // See comments near keyBuffer and key for more info. 
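[Editor's note] The PathEntry machinery above reconstitutes a node's full key from three pieces: prefix bytes borrowed from an ancestor on the traversal path, the node's split string, and its suffix string. A toy illustration with made-up values, using std::string in place of StringRef/Arena handling:

    #include <cassert>
    #include <string>

    int main() {
        std::string ancestorKey = "user/1042/profile"; // key of the prefix source
        int prefixLen = 10;                            // borrowed bytes: "user/1042/"
        std::string split = "se";                      // bytes needed to branch
        std::string suffix = "ttings";                 // remaining key bytes
        std::string key = ancestorKey.substr(0, prefixLen) + split + suffix;
        assert(key == "user/1042/settings");
    }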
- initKeyBufferSpace(); - key = prefixSource->getKeyRef(parser.prefixLen); - } - - inline bool valid() const { - return node != nullptr; - } - - int compareToKey(StringRef s) const { - // Key has at least this node's borrowed prefix bytes in it. - // If s is shorter than key, we only need to compare it to key - if(s.size() < key.size()) - return s.compare(key); - - int cmp = s.substr(0, key.size()).compare(key); - if(cmp != 0) - return cmp; - - // The borrowed prefix bytes and possibly more have already been compared and were equal - int comparedLen = key.size(); - s = s.substr(comparedLen); - StringRef split = parser.splitString(); - int splitSizeOriginal = split.size(); - int splitStart = comparedLen - parser.prefixLen; - if(splitStart < split.size()) { - split = split.substr(splitStart); - if(s.size() < split.size()) - return s.compare(split); - cmp = s.substr(0, split.size()).compare(split); - if(cmp != 0) - return cmp; - s = s.substr(split.size()); - comparedLen += split.size(); - } - - int suffixStart = comparedLen - (parser.prefixLen + splitSizeOriginal); - StringRef suffix = parser.suffixString(); - ASSERT(suffixStart >= 0 && suffixStart <= suffix.size()); - return s.compare(suffix.substr(suffixStart)); - } - - // Make sure that key refers to bytes in keyBuffer, copying if necessary - void ensureKeyInBuffer() const { - if(key.begin() != keyBuffer.begin()) { - memcpy(keyBuffer.begin(), key.begin(), key.size()); - key = StringRef(keyBuffer.begin(), key.size()); - } - } - - // Get the borrowed prefix string. Key must contain all of those bytes but it could contain more. - StringRef getPrefix() const { - if(node == nullptr) - return key; - return key.substr(0, parser.prefixLen); - } - - // Return a reference to the first size bytes of the key. - // - // If size <= key's size then a substring of key will be returned, but if alwaysUseKeyBuffer - // is true then before returning the existing value of key (not just the first size bytes) - // will be copied into keyBuffer and key will be updated to point there. - // - // If size is greater than key's size, then key will be moved into keyBuffer if it is not already there - // and the remaining needed bytes will be copied into keyBuffer from the split and suffix strings. - KeyRef getKeyRef(int size = -1, bool alwaysUseKeyBuffer = false) const { - if(size < 0) - size = parser.keyLen(); - - // If size is less than key then return a substring of it, possibly after moving it to the keyBuffer. - if(size <= key.size()) { - if(alwaysUseKeyBuffer) - ensureKeyInBuffer(); - return key.substr(0, size); - } - - ASSERT(node != nullptr); - ensureKeyInBuffer(); - - // The borrowed prefix bytes and possibly more must already be in key - int writtenLen = key.size(); - StringRef split = parser.splitString(); - StringRef suffix = parser.suffixString(); - int splitStart = writtenLen - parser.prefixLen; - if(splitStart < split.size()) { - int splitLen = std::min(split.size() - splitStart, size - writtenLen); - memcpy(mutateString(key) + writtenLen, split.begin() + splitStart, splitLen); - writtenLen += splitLen; - } - int suffixStart = writtenLen - parser.prefixLen - split.size(); - if(suffixStart < suffix.size()) { - int suffixLen = std::min(suffix.size() - suffixStart, size - writtenLen); - memcpy(mutateString(key) + writtenLen, suffix.begin() + suffixStart, suffixLen); - writtenLen += suffixLen; - } - ASSERT(writtenLen == size); - key = StringRef(key.begin(), size); - return key; - } - - // Return keyRef(size) and the arena that keyBuffer resides in. 
- Key getKey(int size = -1) const { - StringRef k = getKeyRef(size, true); - return Key(k, keyBuffer.arena()); - } - }; - -public: - // Cursor provides a way to seek into a PrefixTree and iterate over its content - // Seek and move methods can return false if they fail to achieve the desired effect - // but a cursor will remain 'valid' as long as the tree is not empty. - // - // It coalesces prefix bytes into a contiguous buffer for each node along the traversal - // path to make iteration faster. - struct Cursor { - Cursor() : pathLen(0) { - } - - Cursor(const Node *root, StringRef prevAncestor, StringRef nextAncestor) { - init(root, prevAncestor, nextAncestor); - } - - static const int initialPathLen = 3; - static const int initialPathCapacity = 20; - // This is a separate function so that Cursors can be reused to search different PrefixTrees - // which avoids cursor destruction and creation which involves unnecessary memory churn. - // The root node is arbitrarily assumed to be a right child of prevAncestor which itself is a left child of nextAncestor - void init(const Node *root, StringRef prevAncestor, StringRef nextAncestor) { - if(path.size() < initialPathCapacity) - path.resize(initialPathCapacity); - pathLen = initialPathLen; - path[0].init(nextAncestor); - path[1].init(prevAncestor); - path[2].init(root, &path[root->flags & Node::PREFIX_SOURCE_NEXT ? 0 : 1], false, 1); - } - - bool operator == (const Cursor &rhs) const { - return pathBack().node == rhs.pathBack().node; - } - - StringRef leftParentBoundary; - StringRef rightParentBoundary; - std::vector path; - // pathLen is the number of elements in path which are in use. This is to prevent constantly destroying - // and constructing PathEntry objects which would unnecessarily churn through memory in Arena for storing - // coalesced prefixes. - int pathLen; - - bool valid() const { - return pathLen != 0 && pathBack().valid(); - } - - // Get a reference to the current key which is valid until the Cursor is moved. - KeyRef getKeyRef() const { - return pathBack().getKeyRef(); - } - - // Get a Standalone for the current key which will still be valid after the Cursor is moved. - Key getKey() const { - return pathBack().getKey(); - } - - // Get a reference to the current value which is valid as long as the Cursor's page memory exists. - ValueRef getValueRef() const { - return pathBack().parser.valueString(); - } - - // Get a key/value reference that is valid until the Cursor is moved. - EntryRef getKVRef() const { - return EntryRef(getKeyRef(), getValueRef()); - } - - // Returns a standalone EntryRef where both key and value exist in the standalone's arena, - // unless copyValue is false in which case the value will be a reference into tree memory. - Entry getKV(bool copyValue = true) const { - Key k = getKey(); - ValueRef v = getValueRef(); - if(copyValue) - v = ValueRef(k.arena(), getValueRef()); - return Entry(EntryRef(k, v), k.arena()); - } - - // Moves the cursor to the node with the greatest key less than or equal to s. If successful, - // returns true, otherwise returns false and the cursor will be at the node with the next key - // greater than s.
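[Editor's note] A hypothetical use of the Cursor contract documented above (tree, prevKey, nextKey, searchKey and process are placeholders; this fragment is not from the original source):

    PrefixTree::Cursor c = tree.getCursor(prevKey, nextKey);
    if (c.seekLessThanOrEqual(searchKey)) {
        do {
            process(c.getKVRef()); // reference stays valid until the cursor moves
        } while (c.moveNext());
    } else if (c.valid()) {
        // Seek failed: the cursor is at the first key greater than searchKey.
    }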
- bool seekLessThanOrEqual(StringRef s) { - if(pathLen == 0) - return false; - - pathLen = initialPathLen; - - // TODO: Track position of difference and use prefix reuse bytes and prefix sources - // to skip comparison of some prefix bytes when possible - while(1) { - const PathEntry &p = pathBack(); - const Node *right = p.parser.rightChild(); - _mm_prefetch((const char*)right, _MM_HINT_T0); - - int cmp = p.compareToKey(s); - if(cmp == 0) - return true; - - if(cmp < 0) { - // Try to traverse left - const Node *left = p.parser.leftChild(); - if(left == nullptr) { - // If we're at the root, cursor should now be before the first element - if(pathLen == initialPathLen) { - return false; - } - - if(p.nodeIsLeftChild) { - // If we only went left, cursor should now be before the first element - if((p.moves + initialPathLen) == pathLen) { - return false; - } - - // Otherwise, go to the parent of the last right child traversed, - // which is the last node from which we went right - popPath(p.moves + 1); - return true; - } - - // p.nodeIsLeftChild is false, so p.node is a right child, so go to its parent. - popPath(1); - return true; - } - - int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1; - const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves; - pushPath(left, borrowSource, true, newMoves); - } - else { - // Try to traverse right - if(right == nullptr) { - return true; - } - - int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1; - const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p; - pushPath(right, borrowSource, false, newMoves); - } - } - } - - inline const PathEntry &pathBack() const { - return path[pathLen - 1]; - } - - inline PathEntry &pathBack() { - return path[pathLen - 1]; - } - - inline void pushPath(const Node *node, const PathEntry *borrowSource, bool left, int moves) { - ++pathLen; - if(path.size() < pathLen) { - path.resize(pathLen); - } - pathBack().init(node, borrowSource, left, moves); - } - - inline void popPath(int n) { - pathLen -= n; - } - - std::string pathToString() const { - std::string s; - for(int i = 0; i < pathLen; ++i) { - s += format("(%d: ", i); - const Node *node = path[i].node; - if(node != nullptr) { - s += "childDir="; - s += (path[i].nodeIsLeftChild ? "left " : "right "); - } - s += format("prefix='%s'", path[i].getPrefix().toHexString(20).c_str()); - if(node != nullptr) { - s += format(" split='%s' suffix='%s' value='%s'", node->getSplitString().toHexString(20).c_str(), node->getSuffixString().toHexString(20).c_str(), node->getValueString().toHexString(20).c_str()); - } - else - s += ") "; - } - return s; - } - - bool moveFirst() { - if(pathLen == 0) - return false; - - pathLen = initialPathLen; - - while(1) { - const PathEntry &p = pathBack(); - const Node *left = p.parser.leftChild(); - - if(left == nullptr) - break; - - // TODO: This can be simpler since it only goes left - int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1; - const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves; - pushPath(left, borrowSource, true, newMoves); - } - - return true; - } - - bool moveLast() { - if(pathLen == 0) - return false; - - pathLen = initialPathLen; - - while(1) { - const PathEntry &p = pathBack(); - const Node *right = p.parser.rightChild(); - - if(right == nullptr) - break; - - // TODO: This can be simpler since it only goes right - int newMoves = p.nodeIsLeftChild ?
1 : p.moves + 1; - const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p; - pushPath(right, borrowSource, false, newMoves); - } - - return true; - } - - bool moveNext() { - const PathEntry &p = pathBack(); - - // If p isn't valid - if(!p.valid()) { - return false; - } - - const Node *right = p.parser.rightChild(); - - // If we can't go right, then go upward to the parent of the last left child - if(right == nullptr) { - // If current node was a left child then pop one node and we're done - if(p.nodeIsLeftChild) { - popPath(1); - return true; - } - - // Current node is a right child. - // If we are at the rightmost tree node return false and don't move. - if(p.moves + initialPathLen - 1 == pathLen) { - return false; - } - - // Truncate path to the parent of the last left child - popPath(p.moves + 1); - return true; - } - - // Go right - int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1; - const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p; - pushPath(right, borrowSource, false, newMoves); - - // Go left as far as possible - while(1) { - const PathEntry &p = pathBack(); - const Node *left = p.parser.leftChild(); - if(left == nullptr) { - return true; - } - - int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1; - const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves; - pushPath(left, borrowSource, true, newMoves); - } - } - - bool movePrev() { - const PathEntry &p = pathBack(); - - // If p isn't valid - if(!p.valid()) { - return false; - } - - const Node *left = p.parser.leftChild(); - - // If we can't go left, then go upward to the parent of the last right child - if(left == nullptr) { - // If current node was a right child - if(!p.nodeIsLeftChild) { - // If we are at the root then don't move and return false. - if(pathLen == initialPathLen) - return false; - - // Otherwise, pop one node from the path and return true. - popPath(1); - return true; - } - - // Current node is a left child. - // If we are at the leftmost tree node then return false and don't move. - if(p.moves + 3 == pathLen) { - return false; - } - - // Truncate path to the parent of the last right child - popPath(p.moves + 1); - return true; - } - - // Go left - int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1; - const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves; - pushPath(left, borrowSource, true, newMoves); - - // Go right as far as possible - while(1) { - const PathEntry &p = pathBack(); - const Node *right = p.parser.rightChild(); - if(right == nullptr) { - return true; - } - - int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1; - const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p; - pushPath(right, borrowSource, false, newMoves); - } - } - - }; - - Cursor getCursor(StringRef prevAncestor, StringRef nextAncestor) const { - return (size != 0) ? 
Cursor(&root, prevAncestor, nextAncestor) : Cursor(); - } - - static std::string escapeForDOT(StringRef s) { - std::string r = "\""; - for(char c : s) { - if(c == '\n') - r += "\\n"; - else if(isprint(c) && c != '"') - r += c; - else - r += format("{%02X}", c); - } - return r + '"'; - } - - std::string toDOT(StringRef prevAncestor, StringRef nextAncestor) const { - auto c = getCursor(prevAncestor, nextAncestor); - c.moveFirst(); - - std::string r; - r += format("digraph PrefixTree%p {\n", this); - - do { - const PathEntry &p = c.pathBack(); - const Node *n = p.node; - const Node *left = p.parser.leftChild(); - const Node *right = p.parser.rightChild(); - - std::string label = escapeForDOT(format("PrefixSource: %s\nPrefix: [%s]\nSplit: %s\nSuffix: %s", - n->flags & Node::PREFIX_SOURCE_NEXT ? "Left" : "Right", - p.getPrefix().toString().c_str(), - p.parser.splitString().toString().c_str(), - p.parser.suffixString().toString().c_str() - )); - - r += format("node%p [ label = %s ];\nnode%p -> { %s %s };\n", n, label.c_str(), n, - left ? format("node%p", left).c_str() : "", - right ? format("node%p", right).c_str() : "" - ); - - } while(c.moveNext()); - - r += "}\n"; - - return r; - } - - // Returns number of bytes written - int build(const EntryRef *begin, const EntryRef *end, StringRef prevAncestor, StringRef nextAncestor) { - // The boundary leading to the new page acts as the last time we branched right - if(begin == end) { - size = 0; - } - else { - size = sizeof(size) + build(root, begin, end, nextAncestor, prevAncestor); - } - ASSERT(size <= MaximumTreeSize()); - return size; - } - -private: - static uint16_t build(Node &root, const EntryRef *begin, const EntryRef *end, const StringRef &nextAncestor, const StringRef &prevAncestor) { - ASSERT(end != begin); - - int count = end - begin; - - // Find key to be stored in root - int mid = perfectSubtreeSplitPointCached(count); - const StringRef &key = begin[mid].key; - const StringRef &val = begin[mid].value; - - // Since key must be between prevAncestor and nextAncestor, any common prefix they share must be shared by key - // so rather than comparing all of key to each one separately we can just compare prevAncestor and nextAncestor - // to each other and then skip over the resulting length in key - int nextPrevCommon = commonPrefixLength(nextAncestor.begin(), prevAncestor.begin(), std::min(nextAncestor.size(), prevAncestor.size())); - - // Pointer to remainder of key after the left/right common bytes - const uint8_t *keyExt = key.begin() + nextPrevCommon; - - // Find out how many bytes beyond nextPrevCommon key has with each last left/right string separately - int extNext = commonPrefixLength(keyExt, nextAncestor.begin() + nextPrevCommon, std::min(key.size(), nextAncestor.size()) - nextPrevCommon); - int extPrev = commonPrefixLength(keyExt, prevAncestor.begin() + nextPrevCommon, std::min(key.size(), prevAncestor.size()) - nextPrevCommon); - - // Use the longer result - bool prefixSourceNext = extNext > extPrev; - - int prefixLen = nextPrevCommon + (prefixSourceNext ? extNext : extPrev); - - int splitLen; // Bytes after prefix required to make traversal decision - int suffixLen; // Remainder of key bytes after split key portion - - //printf("build: '%s'\n prefixLen %d prefixSourceNext %d\n", key.toHexString(20).c_str(), prefixLen, prefixSourceNext); - - // 2 entries or less means no right child, so just put all remaining key bytes into split string.
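[Editor's note] A worked example of the split-length computation below, with made-up keys: if prevKey = "apple" and key = "apply", their common prefix length is 4, so 4 + 1 = 5 bytes of key are needed to branch between them. With prefixLen = 3 (bytes "app" borrowed from an ancestor), splitLen = 4 + 1 - 3 = 2, the bytes "ly" are placed before the left child, and suffixLen = 5 - 2 - 3 = 0.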
- if(count < 3) { - splitLen = key.size() - prefixLen; - suffixLen = 0; - } - else { - // There are 2 children - // Avoid using the suffix at all if the remainder is small enough. - splitLen = key.size() - prefixLen; - if(splitLen < SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT) { - suffixLen = 0; - } - else { - // Remainder of the key was not small enough to put entirely before the left child, so find the actual length required to make the branch decision - const StringRef &prevKey = begin[mid - 1].key; - splitLen = commonPrefixLength(key.begin(), prevKey.begin(), std::min(key.size(), prevKey.size())) + 1 - prefixLen; - - // Put at least the minimum immediate byte count in the split key (before the left child) - if(splitLen < SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN) - splitLen = std::min(key.size() - prefixLen, SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN); - - suffixLen = key.size() - splitLen - prefixLen; - } - } - - // We now know enough about the fields present and their lengths to set the flag bits and write a header - // If any int is more than 8 bits then use large ints - bool large = prefixLen > 255 || splitLen > 255 || suffixLen > 255 || val.size() > 255; - root.flags = large ? Node::USE_LARGE_LENGTHS : 0; - - if(prefixSourceNext) - root.flags |= Node::PREFIX_SOURCE_NEXT; - - union { - uint8_t *p8; - uint16_t *p16; - }; - p8 = &root.flags + 1; - - if(large) - *p16++ = prefixLen; - else - *p8++ = prefixLen; - - if(splitLen > 0) { - root.flags |= Node::HAS_SPLIT; - if(large) - *p16++ = splitLen; - else - *p8++ = splitLen; - } - - uint16_t *pLeftLen = p16; - if(count > 1) { - ++p16; - } - - if(suffixLen > 0) { - root.flags |= Node::HAS_SUFFIX; - if(large) - *p16++ = suffixLen; - else - *p8++ = suffixLen; - } - - if(val.size() > 0) { - root.flags |= Node::HAS_VALUE; - if(large) - *p16++ = val.size(); - else - *p8++ = val.size(); - } - - // Header is written, now write strings and children in order.
- const uint8_t *keyPtr = key.begin() + prefixLen; - - // Serialize split bytes - if(splitLen > 0) { - memcpy(p8, keyPtr, splitLen); - p8 += splitLen; - keyPtr += splitLen; - } - - // Serialize left child - if(count > 1) { - root.flags |= Node::HAS_LEFT_CHILD; - int leftLen = build(*(Node *)(p8), begin, begin + mid, key, prevAncestor); - *pLeftLen = leftLen; - p8 += leftLen; - } - - // Serialize suffix bytes - if(suffixLen > 0) { - memcpy(p8, keyPtr, suffixLen); - p8 += suffixLen; - } - - // Serialize value bytes - if(val.size() > 0) { - memcpy(p8, val.begin(), val.size()); - p8 += val.size(); - } - - // Serialize right child - if(count > 2) { - root.flags |= Node::HAS_RIGHT_CHILD; - int rightLen = build(*(Node *)(p8), begin + mid + 1, end, nextAncestor, key); - p8 += rightLen; - } - -/* -printf("\nBuilt: key '%s' c %d p %d spl %d suf %d\nRaw: %s\n", key.toString().c_str(), count, prefixLen, splitLen, suffixLen, StringRef(&root.flags, p8 - &root.flags).toHexString(20).c_str()); -Node::Parser p(&root); -printf("parser: headerLen %d prefixLen %d leftPos %d rightPos %d split %s suffix %s val %s\n", - p.headerLen, p.prefixLen, p.leftPos, p.rightPos, p.splitString().toString().c_str(), p.suffixString().toString().c_str(), p.valueString().toString().c_str()); -*/ - return p8 - (uint8_t *)&root; - } -}; diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index c305ab72c1..64f0501fbf 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -65,11 +65,12 @@ ACTOR Future restoreApplierCore(RestoreApplierInterface applierInterf, int } when(RestoreVersionBatchRequest req = waitNext(applierInterf.initVersionBatch.getFuture())) { requestTypeStr = "initVersionBatch"; - actors.add(handleInitVersionBatchRequest(req, self)); + wait(handleInitVersionBatchRequest(req, self)); } when(RestoreVersionBatchRequest req = waitNext(applierInterf.finishRestore.getFuture())) { requestTypeStr = "finishRestore"; - exitRole = handleFinishRestoreRequest(req, self); + handleFinishRestoreRequest(req, self); + exitRole = Void(); } when(wait(exitRole)) { TraceEvent("FastRestore").detail("RestoreApplierCore", "ExitRole").detail("NodeID", self->id()); @@ -115,12 +116,14 @@ ACTOR static Future handleSendMutationVectorRequest(RestoreSendMutationVec state int mIndex = 0; for (mIndex = 0; mIndex < mutations.size(); mIndex++) { MutationRef mutation = mutations[mIndex]; - // TraceEvent(SevDebug, "FastRestore") - // .detail("ApplierNode", self->id()) - // .detail("FileUID", req.fileUID) - // .detail("Version", commitVersion) - // .detail("MutationReceived", mutation.toString()); + TraceEvent(SevDebug, "FastRestore") + .detail("ApplierNode", self->id()) + .detail("FileUID", req.fileIndex) + .detail("Version", commitVersion) + .detail("Index", mIndex) + .detail("MutationReceived", mutation.toString()); self->kvOps[commitVersion].push_back_deep(self->kvOps[commitVersion].arena(), mutation); + // TODO: What if log file's mutations are delivered out-of-order (behind) the range file's mutations?! 
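[Editor's note] handleSendMutationVectorRequest above buffers each received mutation into kvOps keyed by commit version. The core idea, sketched with plain STL stand-ins (the real VersionedMutationsMap is arena-backed; all names here are illustrative):

    #include <cstdint>
    #include <map>
    #include <string>
    #include <vector>

    struct Mutation { std::string key, value; };
    using VersionedMutations = std::map<int64_t, std::vector<Mutation>>;

    void buffer(VersionedMutations& kvOps, int64_t commitVersion, Mutation m) {
        kvOps[commitVersion].push_back(std::move(m)); // group by version
    }

    void applyAll(const VersionedMutations& kvOps) {
        // std::map iterates keys in ascending order, so mutations are applied in
        // commit-version order regardless of the order the batches arrived in.
        for (const auto& [version, mutations] : kvOps) {
            for (const auto& m : mutations) {
                (void)version; (void)m; // apply the mutation to the database here
            }
        }
    }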
} curFilePos.set(req.version); } @@ -218,9 +221,8 @@ struct DBApplyProgress { } bool shouldCommit() { - // TODO: Change transactionSize > 0 to transactionSize > opConfig.transactionBatchSizeThreshold to batch - // mutations in a txn - return (!lastTxnHasError && (startNextVersion || transactionSize > 0 || curItInCurTxn == self->kvOps.end())); + return (!lastTxnHasError && (startNextVersion || transactionSize >= opConfig.transactionBatchSizeThreshold || + curItInCurTxn == self->kvOps.end())); } bool hasError() { return lastTxnHasError; } @@ -270,6 +272,29 @@ ACTOR Future applyToDB(Reference self, Database cx) { } state Reference tr(new ReadYourWritesTransaction(cx)); + // Sanity check the restoreApplierKeys, which should be empty at this point + loop { + try { + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + Key begin = restoreApplierKeyFor(self->id(), 0); + Key end = restoreApplierKeyFor(self->id(), std::numeric_limits::max()); + Standalone txnIds = wait(tr->getRange(KeyRangeRef(begin, end), CLIENT_KNOBS->TOO_MANY)); + if (txnIds.size() > 0) { + TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean").detail("TxnIds", txnIds.size()); + for (auto& kv : txnIds) { + std::pair applierInfo = decodeRestoreApplierKey(kv.key); + TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean") + .detail("Applier", applierInfo.first) + .detail("ResidueTxnID", applierInfo.second); + } + } + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } loop { // Transaction retry loop try { @@ -299,7 +324,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { TraceEvent("FastRestore_ApplierTxn") .detail("ApplierApplyToDB", self->id()) .detail("TxnId", progress.curTxnId) - .detail("StartIndexInCurrentTxn", progress.curIndexInCurTxn) + .detail("CurrentIndexInCurrentTxn", progress.curIndexInCurTxn) .detail("CurrentIteratorMutations", progress.curItInCurTxn->second.size()) .detail("Version", progress.curItInCurTxn->first); @@ -315,7 +340,13 @@ ACTOR Future applyToDB(Reference self, Database cx) { TraceEvent(SevError, "FastRestore").detail("InvalidMutationType", m.type); } - // TraceEvent(SevDebug, "FastRestore_Debug").detail("ApplierApplyToDB", self->describeNode()).detail("Version", progress.curItInCurTxn->first).detail("Mutation", m.toString()); + TraceEvent(SevDebug, "FastRestore_Debug") + .detail("ApplierApplyToDB", self->describeNode()) + .detail("Version", progress.curItInCurTxn->first) + .detail("Index", progress.curIndexInCurTxn) + .detail("Mutation", m.toString()) + .detail("MutationSize", m.expectedSize()) + .detail("TxnSize", progress.transactionSize); if (m.type == MutationRef::SetValue) { tr->set(m.param1, m.param2); } else if (m.type == MutationRef::ClearRange) { @@ -332,14 +363,10 @@ ACTOR Future applyToDB(Reference self, Database cx) { progress.transactionSize += m.expectedSize(); - if (progress.transactionSize >= opConfig.transactionBatchSizeThreshold) { // commit per 512B + progress.nextMutation(); // Prepare for the next mutation + // commit per transactionBatchSizeThreshold bytes; and commit does not cross version boundary + if (progress.shouldCommit()) { break; // Got enough mutation in the txn - } else { - progress.nextMutation(); - // Mutations in the same transaction come from the same version - if (progress.startNextVersion || progress.isDone()) { - break; - } } } } // !lastTxnHasError @@ -348,8 +375,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { if (progress.shouldCommit()) { 
wait(tr->commit()); } - // Logic for a successful transaction: Update current txn info and uncommitted txn info - progress.nextMutation(); + if (progress.isDone()) { // Are all mutations processed? break; } @@ -359,7 +385,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { .detail("TxnStatus", "?") .detail("ApplierApplyToDB", self->id()) .detail("TxnId", progress.curTxnId) - .detail("StartIndexInCurrentTxn", progress.curIndexInCurTxn) + .detail("CurrentIndexInCurrentTxn", progress.curIndexInCurTxn) .detail("Version", progress.curItInCurTxn->first) .error(e, true); progress.lastTxnHasError = true; @@ -381,8 +407,9 @@ ACTOR Future applyToDB(Reference self, Database cx) { tr->reset(); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); + // Clear txnIds in [0, progress.curTxnId). We add 100 to curTxnId just to be safe. tr->clear(KeyRangeRef(restoreApplierKeyFor(self->id(), 0), - restoreApplierKeyFor(self->id(), progress.curTxnId + 1))); + restoreApplierKeyFor(self->id(), progress.curTxnId + 100))); wait(tr->commit()); break; } catch (Error& e) { diff --git a/fdbserver/RestoreApplier.actor.h b/fdbserver/RestoreApplier.actor.h index 37f9b78b08..038d3c3d4a 100644 --- a/fdbserver/RestoreApplier.actor.h +++ b/fdbserver/RestoreApplier.actor.h @@ -34,7 +34,7 @@ #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include "fdbserver/CoordinationInterface.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/RestoreRoleCommon.actor.h" @@ -128,4 +128,4 @@ struct RestoreApplierData : RestoreRoleData, public ReferenceCounted restoreApplierCore(RestoreApplierInterface applierInterf, int nodeIndex, Database cx); #include "flow/unactorcompiler.h" -#endif \ No newline at end of file +#endif diff --git a/fdbserver/RestoreCommon.actor.cpp b/fdbserver/RestoreCommon.actor.cpp index ac6e638f4c..d8689d136f 100644 --- a/fdbserver/RestoreCommon.actor.cpp +++ b/fdbserver/RestoreCommon.actor.cpp @@ -32,6 +32,7 @@ #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/MutationList.h" #include "fdbclient/BackupContainer.h" +#include "flow/actorcompiler.h" // This must be the last #include. // Split RestoreConfigFR defined in FileBackupAgent.actor.cpp to declaration in Restore.actor.h and implementation in // RestoreCommon.actor.cpp @@ -268,7 +269,6 @@ ACTOR Future RestoreConfigFR::getFullStatus_impl(Reference progress = restore->getProgress(tr); // restore might no longer be valid after the first wait so make sure it is not needed anymore. - state UID uid = restore->getUid(); wait(success(ranges) && success(addPrefix) && success(removePrefix) && success(url) && success(restoreVersion) && success(progress)); @@ -322,8 +322,8 @@ struct StringRefReader { // Functions for consuming big endian (network byte order) integers. // Consumes a big endian number, swaps it to little endian, and returns it. 
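[Editor's note] The consumeNetwork* helpers above pair a raw read with a bigEndian32/64 byte swap. The same decode written out by hand, as a self-contained check (illustrative; equivalent to the swap on little-endian hosts):

    #include <cassert>
    #include <cstdint>

    static uint32_t decodeBigEndian32(const uint8_t* p) {
        // Most significant byte first, as sent on the network.
        return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
               (uint32_t(p[2]) << 8) | uint32_t(p[3]);
    }

    int main() {
        const uint8_t buf[4] = { 0x00, 0x00, 0x01, 0x02 };
        assert(decodeBigEndian32(buf) == 0x102); // 258
    }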
- const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } - const uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } + int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } + uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } bool eof() { return rptr == end; } @@ -433,4 +433,4 @@ ACTOR Future>> decodeLogFileBlock(Reference, uint32_t> SerializedMutationPartMap; bool isRangeMutation(MutationRef m); void splitMutation(Reference self, MutationRef m, Arena& mvector_arena, VectorRef& mvector, Arena& nodeIDs_arena, VectorRef& nodeIDs); -void _parseSerializedMutation(VersionedMutationsMap* kvOps, SerializedMutationListMap* mutationMap, - bool isSampling = false); +void _parseSerializedMutation(std::map::iterator kvOpsIter, + SerializedMutationListMap* mutationMap, bool isSampling = false); -ACTOR Future handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference self); -ACTOR Future handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, - Reference self); +void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference self); ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Reference self, bool isSampling = false); +ACTOR Future handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req, + Reference self); ACTOR Future sendMutationsToApplier(Reference self, VersionedMutationsMap* kvOps, bool isRangeFile, Version startVersion, Version endVersion, int fileIndex); ACTOR static Future _parseLogFileToMutationsOnLoader( NotifiedVersion* pProcessedFileOffset, SerializedMutationListMap* mutationMap, SerializedMutationPartMap* mutationPartMap, Reference bc, Version version, std::string fileName, int64_t readOffset, int64_t readLen, KeyRange restoreRange, Key addPrefix, Key removePrefix, Key mutationLogPrefix); -ACTOR static Future _parseRangeFileToMutationsOnLoader(VersionedMutationsMap* kvOps, - Reference bc, Version version, - std::string fileName, int64_t readOffset_input, - int64_t readLen_input, KeyRange restoreRange); +ACTOR static Future _parseRangeFileToMutationsOnLoader( + std::map::iterator kvOpsIter, Reference bc, Version version, + std::string fileName, int64_t readOffset_input, int64_t readLen_input, KeyRange restoreRange); ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int nodeIndex, Database cx) { state Reference self = @@ -72,25 +71,25 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no } when(RestoreSysInfoRequest req = waitNext(loaderInterf.updateRestoreSysInfo.getFuture())) { requestTypeStr = "updateRestoreSysInfo"; - actors.add(handleRestoreSysInfoRequest(req, self)); - } - when(RestoreSetApplierKeyRangeVectorRequest req = - waitNext(loaderInterf.setApplierKeyRangeVectorRequest.getFuture())) { - requestTypeStr = "setApplierKeyRangeVectorRequest"; - actors.add(handleSetApplierKeyRangeVectorRequest(req, self)); + handleRestoreSysInfoRequest(req, self); } when(RestoreLoadFileRequest req = waitNext(loaderInterf.loadFile.getFuture())) { requestTypeStr = "loadFile"; self->initBackupContainer(req.param.url); actors.add(handleLoadFileRequest(req, self, false)); } + when(RestoreSendMutationsToAppliersRequest req = waitNext(loaderInterf.sendMutations.getFuture())) { + requestTypeStr = "sendMutations"; + actors.add(handleSendMutationsRequest(req, self)); + } when(RestoreVersionBatchRequest req = waitNext(loaderInterf.initVersionBatch.getFuture())) { requestTypeStr = "initVersionBatch"; - 
actors.add(handleInitVersionBatchRequest(req, self)); + wait(handleInitVersionBatchRequest(req, self)); } when(RestoreVersionBatchRequest req = waitNext(loaderInterf.finishRestore.getFuture())) { requestTypeStr = "finishRestore"; - exitRole = handleFinishRestoreRequest(req, self); + handleFinishRestoreRequest(req, self); + exitRole = Void(); } when(wait(exitRole)) { TraceEvent("FastRestore").detail("RestoreLoaderCore", "ExitRole").detail("NodeID", self->id()); @@ -109,31 +108,19 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no } // Assume: Only update the local data if it (applierInterf) has not been set -ACTOR Future handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference self) { +void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference self) { TraceEvent("FastRestore").detail("HandleRestoreSysInfoRequest", self->id()); ASSERT(self.isValid()); // The loader has received the appliers' interfaces if (!self->appliersInterf.empty()) { req.reply.send(RestoreCommonReply(self->id())); - return Void(); + return; } self->appliersInterf = req.sysInfo.appliers; req.reply.send(RestoreCommonReply(self->id())); - return Void(); -} - -ACTOR Future handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, - Reference self) { - // Idempotent operation. OK to re-execute the duplicate cmd - if (self->rangeToApplier.empty()) { - self->rangeToApplier = req.rangeToApplier; - } - req.reply.send(RestoreCommonReply(self->id())); - - return Void(); } ACTOR Future _processLoadingParam(LoadingParam param, Reference self) { @@ -141,10 +128,14 @@ ACTOR Future _processLoadingParam(LoadingParam param, Referenceid()).detail("StartProcessLoadParam", param.toString()); ASSERT(param.blockSize > 0); ASSERT(param.offset % param.blockSize == 0); // Parse file must be at block boundary. + ASSERT(self->kvOpsPerLP.find(param) == self->kvOpsPerLP.end()); + // NOTE: map's iterators are guaranteed to be stable, but pointers may not be.
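[Editor's note] The NOTE above leans on a standard-library guarantee: std::map is node-based, so inserting or erasing other keys never invalidates an existing iterator or element reference, which is why kvOpsPerLP's iterator can safely be held across later operations. A self-contained check (illustrative):

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
        std::map<int, std::string> m;
        auto it = m.emplace(1, "first").first;
        for (int i = 2; i < 1000; ++i) m.emplace(i, "other"); // nodes are never moved
        assert(it->second == "first"); // iterator is still valid after many inserts
    }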
+ // state VersionedMutationsMap* kvOps = &self->kvOpsPerLP[param]; + self->kvOpsPerLP.emplace(param, VersionedMutationsMap()); + state std::map::iterator kvOpsPerLPIter = self->kvOpsPerLP.find(param); - // Temporary data structure for parsing range and log files into (version, ) + // Temporary data structure for parsing log files into (version, ) // Must use StandAlone to save mutations, otherwise, the mutationref memory will be corrupted - state VersionedMutationsMap kvOps; // mutationMap: Key is the unique identifier for a batch of mutation logs at the same version state SerializedMutationListMap mutationMap; state std::map, uint32_t> mutationPartMap; // Sanity check the data parsing is correct @@ -159,7 +150,7 @@ ACTOR Future _processLoadingParam(LoadingParam param, Reference(param.blockSize, param.length - j); if (param.isRangeFile) { fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader( - &kvOps, self->bc, param.version, param.filename, readOffset, readLen, param.restoreRange)); + kvOpsPerLPIter, self->bc, param.version, param.filename, readOffset, readLen, param.restoreRange)); } else { fileParserFutures.push_back(_parseLogFileToMutationsOnLoader( &processedFileOffset, &mutationMap, &mutationPartMap, self->bc, param.version, param.filename, @@ -169,12 +160,9 @@ ACTOR Future _processLoadingParam(LoadingParam param, Referenceid()).detail("FinishLoadingFile", param.filename); return Void(); @@ -187,10 +175,35 @@ ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Referenceid()).detail("ProcessLoadParam", req.param.toString()); self->processedFileParams[req.param] = Never(); self->processedFileParams[req.param] = _processLoadingParam(req.param, self); + } else { + TraceEvent("FastRestore").detail("Loader", self->id()).detail("WaitOnProcessLoadParam", req.param.toString()); } ASSERT(self->processedFileParams.find(req.param) != self->processedFileParams.end()); wait(self->processedFileParams[req.param]); // wait on the processing of the req.param. 
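[Editor's note] handleLoadFileRequest above deduplicates repeated requests for the same LoadingParam: the first request stores the processing actor's Future in processedFileParams (apparently seeding the entry with Never() so it exists before the actor first runs), and every later duplicate simply waits on the stored Future before replying. The shape of the idiom, as a flow-style fragment (names illustrative, not compilable on its own):

    if (processed.find(param) == processed.end()) {
        processed[param] = doProcess(param); // start the work exactly once
    }
    wait(processed[param]); // the original and all duplicates block here
    req.reply.send(reply);  // reply only after processing has finished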
+ // TODO: Send sampled mutations back to master + req.reply.send(RestoreCommonReply(self->id())); + return Void(); +} + +ACTOR Future handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req, + Reference self) { + if (self->rangeToApplier.empty()) { + self->rangeToApplier = req.rangeToApplier; + } else { + ASSERT(self->rangeToApplier == req.rangeToApplier); + } + + // Send mutations from log files first to ensure log mutation at the same version is before the range kv + state std::map::iterator item = self->kvOpsPerLP.begin(); + for (; item != self->kvOpsPerLP.end(); item++) { + if (item->first.isRangeFile == req.useRangeFile) { + // Send the parsed mutation to applier who will apply the mutation to DB + wait(sendMutationsToApplier(self, &item->second, item->first.isRangeFile, item->first.prevVersion, + item->first.endVersion, item->first.fileIndex)); + } + } + req.reply.send(RestoreCommonReply(self->id())); return Void(); } @@ -345,8 +358,6 @@ void splitMutation(Reference self, MutationRef m, Arena& mvec mvector.push_back_deep(mvector_arena, curm); nodeIDs.push_back(nodeIDs_arena, itApplier->second); } - - return; } // key_input format: @@ -360,13 +371,14 @@ bool concatenateBackupMutationForLogFile(std::map, Standal std::string prefix = "||\t"; std::stringstream ss; StringRef val = val_input.contents(); + const int key_prefix_len = sizeof(uint8_t) + sizeof(Version) + sizeof(uint32_t); StringRefReaderMX reader(val, restore_corrupted_data()); StringRefReaderMX readerKey(key_input, restore_corrupted_data()); // read key_input! - int logRangeMutationFirstLength = key_input.size() - 1 - 8 - 4; + int logRangeMutationFirstLength = key_input.size() - key_prefix_len; bool concatenated = false; - ASSERT_WE_THINK(key_input.size() >= 1 + 8 + 4); + ASSERT_WE_THINK(key_input.size() >= key_prefix_len); if (logRangeMutationFirstLength > 0) { // Strip out the [logRangeMutation.first]; otherwise, the following readerKey.consume will produce wrong value @@ -374,10 +386,10 @@ bool concatenateBackupMutationForLogFile(std::map, Standal } readerKey.consume(); // uint8_t hashValue = readerKey.consume() - uint64_t commitVersion = readerKey.consumeNetworkUInt64(); + Version commitVersion = readerKey.consumeNetworkUInt64(); uint32_t part = readerKey.consumeNetworkUInt32(); // Use commitVersion as id - Standalone id = StringRef((uint8_t*)&commitVersion, 8); + Standalone id = StringRef((uint8_t*)&commitVersion, sizeof(Version)); if (mutationMap.find(id) == mutationMap.end()) { mutationMap.insert(std::make_pair(id, val_input)); @@ -425,8 +437,9 @@ bool isRangeMutation(MutationRef m) { // we may not get the entire mutation list for the version encoded_list_of_mutations: // [mutation1][mutation2]...[mutationk], where // a mutation is encoded as [type:uint32_t][keyLength:uint32_t][valueLength:uint32_t][keyContent][valueContent] -void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationListMap* pmutationMap, bool isSampling) { - VersionedMutationsMap& kvOps = *pkvOps; +void _parseSerializedMutation(std::map::iterator kvOpsIter, + SerializedMutationListMap* pmutationMap, bool isSampling) { + VersionedMutationsMap& kvOps = kvOpsIter->second; SerializedMutationListMap& mutationMap = *pmutationMap; for (auto& m : mutationMap) { @@ -439,10 +452,11 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL StringRefReaderMX vReader(val, restore_corrupted_data()); vReader.consume(); // Consume the includeVersion - uint32_t val_length_decoded = - vReader.consume(); // 
Parse little endian value, confirmed it is correct! - ASSERT(val_length_decoded == - val.size() - 12); // 12 is the length of [includeVersion:uint64_t][val_length:uint32_t] + // TODO(xumengpanda): verify the protocol version is compatible and raise error if needed + + // Parse little endian value, confirmed it is correct! + uint32_t val_length_decoded = vReader.consume(); + ASSERT(val_length_decoded == val.size() - sizeof(uint64_t) - sizeof(uint32_t)); while (1) { // stop when reach the end of the string @@ -457,7 +471,9 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL const uint8_t* v = vReader.consume(vLen); MutationRef mutation((MutationRef::Type)type, KeyRef(k, kLen), KeyRef(v, vLen)); - //TraceEvent(SevDebug, "FastRestore_VerboseDebug").detail("CommitVersion", commitVersion).detail("ParsedMutation", mutation.toString()); + TraceEvent(SevFRMutationInfo, "FastRestore_VerboseDebug") + .detail("CommitVersion", commitVersion) + .detail("ParsedMutation", mutation.toString()); kvOps[commitVersion].push_back_deep(kvOps[commitVersion].arena(), mutation); ASSERT_WE_THINK(kLen >= 0 && kLen < val.size()); ASSERT_WE_THINK(vLen >= 0 && vLen < val.size()); @@ -466,11 +482,10 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL } // Parsing the data blocks in a range file -ACTOR static Future _parseRangeFileToMutationsOnLoader(VersionedMutationsMap* pkvOps, - Reference bc, Version version, - std::string fileName, int64_t readOffset, int64_t readLen, - KeyRange restoreRange) { - state VersionedMutationsMap& kvOps = *pkvOps; +ACTOR static Future _parseRangeFileToMutationsOnLoader( + std::map::iterator kvOpsIter, Reference bc, Version version, + std::string fileName, int64_t readOffset, int64_t readLen, KeyRange restoreRange) { + state VersionedMutationsMap& kvOps = kvOpsIter->second; // The set of key value version is rangeFile.version. the key-value set in the same range file has the same version Reference inFile = wait(bc->readFile(fileName)); @@ -519,7 +534,9 @@ ACTOR static Future _parseRangeFileToMutationsOnLoader(VersionedMutationsM // We cache all kv operations into kvOps, and apply all kv operations later in one place kvOps.insert(std::make_pair(version, VectorRef())); - //TraceEvent(SevDebug, "FastRestore_VerboseDebug").detail("CommitVersion", version).detail("ParsedMutationKV", m.toString()); + TraceEvent(SevFRMutationInfo, "FastRestore_VerboseDebug") + .detail("CommitVersion", version) + .detail("ParsedMutationKV", m.toString()); ASSERT_WE_THINK(kvOps.find(version) != kvOps.end()); kvOps[version].push_back_deep(kvOps[version].arena(), m); diff --git a/fdbserver/RestoreLoader.actor.h b/fdbserver/RestoreLoader.actor.h index b6e44aa2e5..83331fb26e 100644 --- a/fdbserver/RestoreLoader.actor.h +++ b/fdbserver/RestoreLoader.actor.h @@ -34,7 +34,7 @@ #include "fdbrpc/fdbrpc.h" #include "fdbserver/CoordinationInterface.h" #include "fdbrpc/Locality.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/RestoreCommon.actor.h" #include "fdbserver/RestoreRoleCommon.actor.h" @@ -44,6 +44,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted { std::map> processedFileParams; + std::map kvOpsPerLP; // Buffered kvOps for each loading param // rangeToApplier is in master and loader. 
Loader uses this to determine which applier a mutation should be sent // KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for @@ -79,6 +80,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted restoreLoaderCore(RestoreLoaderInterface loaderInterf, int nodeIndex, Database cx); #include "flow/unactorcompiler.h" -#endif \ No newline at end of file +#endif diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index e9ed9bd593..f7dfc13b56 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -51,7 +51,6 @@ ACTOR static Future distributeRestoreSysInfo(Reference ACTOR static Future>> collectRestoreRequests(Database cx); ACTOR static Future initializeVersionBatch(Reference self); -ACTOR static Future notifyLoaderAppliersKeyRange(Reference self); ACTOR static Future notifyApplierToApplyMutations(Reference self); ACTOR static Future notifyRestoreCompleted(Reference self, Database cx); @@ -193,7 +192,7 @@ ACTOR Future startProcessRestoreRequests(Reference self for (restoreIndex = 0; restoreIndex < restoreRequests.size(); restoreIndex++) { RestoreRequest& request = restoreRequests[restoreIndex]; TraceEvent("FastRestore").detail("RestoreRequestInfo", request.toString()); - Version ver = wait(processRestoreRequest(self, cx, request)); + wait(success(processRestoreRequest(self, cx, request))); } } catch (Error& e) { TraceEvent(SevError, "FastRestoreFailed").detail("RestoreRequest", restoreRequests[restoreIndex].toString()); @@ -308,6 +307,21 @@ ACTOR static Future loadFilesOnLoaders(Reference self, return Void(); } +// Ask loaders to send its buffered mutations to appliers +ACTOR static Future sendMutationsFromLoaders(Reference self, bool useRangeFile) { + TraceEvent("FastRestore") + .detail("SendMutationsFromLoaders", self->batchIndex) + .detail("UseRangeFiles", useRangeFile); + + std::vector> requests; + for (auto& loader : self->loadersInterf) { + requests.emplace_back(loader.first, RestoreSendMutationsToAppliersRequest(self->rangeToApplier, useRangeFile)); + } + wait(sendBatchRequests(&RestoreLoaderInterface::sendMutations, self->loadersInterf, requests)); + + return Void(); +} + ACTOR static Future distributeWorkloadPerVersionBatch(Reference self, Database cx, RestoreRequest request, VersionBatch versionBatch) { ASSERT(!versionBatch.isEmpty()); @@ -315,13 +329,19 @@ ACTOR static Future distributeWorkloadPerVersionBatch(ReferenceloadersInterf.size() > 0); ASSERT(self->appliersInterf.size() > 0); - dummySampleWorkload(self); - wait(notifyLoaderAppliersKeyRange(self)); + dummySampleWorkload(self); // TODO: Delete // Parse log files and send mutations to appliers before we parse range files + // TODO: Allow loading both range and log files in parallel wait(loadFilesOnLoaders(self, cx, request, versionBatch, false)); wait(loadFilesOnLoaders(self, cx, request, versionBatch, true)); + // Loaders should ensure log files' mutations sent to appliers before range files' mutations + // TODO: Let applier buffer mutations from log and range files differently so that loaders can send mutations in + // parallel + wait(sendMutationsFromLoaders(self, false)); + wait(sendMutationsFromLoaders(self, true)); + wait(notifyApplierToApplyMutations(self)); return Void(); @@ -331,20 +351,22 @@ ACTOR static Future distributeWorkloadPerVersionBatch(Reference self) { int numAppliers = self->appliersInterf.size(); - std::vector keyrangeSplitter; + std::vector keyrangeSplitter; // We will use the splitter at [1, 
numAppliers - 1]. The first splitter is normalKeys.begin int i; - for (i = 0; i < numAppliers - 1; i++) { - keyrangeSplitter.push_back(deterministicRandom()->randomUniqueID()); + for (i = 0; i < numAppliers; i++) { + keyrangeSplitter.push_back(Key(deterministicRandom()->randomUniqueID().toString())); } std::sort(keyrangeSplitter.begin(), keyrangeSplitter.end()); i = 0; + self->rangeToApplier.clear(); for (auto& applier : self->appliersInterf) { if (i == 0) { self->rangeToApplier[normalKeys.begin] = applier.first; } else { - self->rangeToApplier[StringRef(keyrangeSplitter[i].toString())] = applier.first; + self->rangeToApplier[keyrangeSplitter[i]] = applier.first; } + i++; } self->logApplierKeyRange(); } @@ -412,11 +434,13 @@ ACTOR static Future collectBackupFiles(Reference bc, std for (const RangeFile& f : restorable.get().ranges) { TraceEvent("FastRestore").detail("RangeFile", f.toString()); RestoreFileFR file(f.version, f.fileName, true, f.blockSize, f.fileSize, f.version, f.version); + TraceEvent("FastRestore").detail("RangeFileFR", file.toString()); files->push_back(file); } for (const LogFile& f : restorable.get().logs) { TraceEvent("FastRestore").detail("LogFile", f.toString()); RestoreFileFR file(f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion, f.beginVersion); + TraceEvent("FastRestore").detail("LogFileFR", file.toString()); files->push_back(file); } @@ -464,17 +488,6 @@ ACTOR static Future notifyApplierToApplyMutations(Reference notifyLoaderAppliersKeyRange(Reference self) { - std::vector> requests; - for (auto& loader : self->loadersInterf) { - requests.push_back(std::make_pair(loader.first, RestoreSetApplierKeyRangeVectorRequest(self->rangeToApplier))); - } - wait(sendBatchRequests(&RestoreLoaderInterface::setApplierKeyRangeVectorRequest, self->loadersInterf, requests)); - - return Void(); -} - // Ask all loaders and appliers to perform housecleaning at the end of restore and // Register the restoreRequestDoneKey to signal the end of restore ACTOR static Future notifyRestoreCompleted(Reference self, Database cx) { @@ -514,4 +527,4 @@ ACTOR static Future notifyRestoreCompleted(Reference se TraceEvent("FastRestore").detail("RestoreMaster", "RestoreCompleted"); return Void(); -} \ No newline at end of file +} diff --git a/fdbserver/RestoreMaster.actor.h b/fdbserver/RestoreMaster.actor.h index 1ec8819c37..3cfb0956b4 100644 --- a/fdbserver/RestoreMaster.actor.h +++ b/fdbserver/RestoreMaster.actor.h @@ -54,7 +54,7 @@ struct VersionBatch { struct RestoreMasterData : RestoreRoleData, public ReferenceCounted { // rangeToApplier is on the master and loader nodes. Loader uses this to determine to which applier a mutation should be sent.
// KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for - std::map<Standalone<KeyRef>, UID> rangeToApplier; + std::map<Key, UID> rangeToApplier; std::map versionBatches; // key is the beginVersion of the version batch int batchIndex; @@ -68,7 +68,7 @@ struct RestoreMasterData : RestoreRoleData, public ReferenceCounted for (auto versionBatch = versionBatches->begin(); versionBatch != versionBatches->end(); versionBatch++) { std::sort(versionBatch->second.rangeFiles.begin(), versionBatch->second.rangeFiles.end()); std::sort(versionBatch->second.logFiles.begin(), versionBatch->second.logFiles.end()); for (auto& logFile : versionBatch->second.logFiles) { logFile.fileIndex = ++fileIndex; + TraceEvent("FastRestore") + .detail("VersionBatchId", versionBatchId) + .detail("LogFile", logFile.toString()); } for (auto& rangeFile : versionBatch->second.rangeFiles) { rangeFile.fileIndex = ++fileIndex; + TraceEvent("FastRestore") + .detail("VersionBatchId", versionBatchId) + .detail("RangeFile", rangeFile.toString()); } + versionBatchId++; } TraceEvent("FastRestore").detail("VersionBatches", versionBatches->size()); diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index 6217dc8c85..80b9db92a2 100644 --- a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -39,11 +39,10 @@ struct RestoreWorkerData; ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id) { wait(delayJittered(5.0)); // Random jitter reduces heartbeat monitor's pressure req.reply.send(RestoreCommonReply(id)); - return Void(); } -ACTOR Future handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference self) { +void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference self) { if (self->versionBatchStart) { self->versionBatchStart = false; } @@ -54,19 +53,22 @@ ACTOR Future handleFinishRestoreRequest(RestoreVersionBatchRequest req, Re .detail("Node", self->id()); req.reply.send(RestoreCommonReply(self->id())); - - return Void(); } ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self) { - self->resetPerVersionBatch(); - TraceEvent("FastRestore") - .detail("InitVersionBatch", req.batchID) - .detail("Role", getRoleStr(self->role)) - .detail("Node", self->id()); + // batchId is continuous. (req.batchID-1) is the id of the just finished batch.
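+ // The NotifiedVersion versionBatchId sequences these requests: whenAtLeast(req.batchID - 1) parks a
+ // request until the previous batch has been initialized, and the equality check below turns a retried
+ // request for an already-initialized batch into a no-op that only resends the reply.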
+ wait(self->versionBatchId.whenAtLeast(req.batchID - 1)); + + if (self->versionBatchId.get() == req.batchID - 1) { + self->resetPerVersionBatch(); + TraceEvent("FastRestore") + .detail("InitVersionBatch", req.batchID) + .detail("Role", getRoleStr(self->role)) + .detail("Node", self->id()); + self->versionBatchId.set(req.batchID); + } req.reply.send(RestoreCommonReply(self->id())); - return Void(); } diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index 86d63bbaa4..f6a5c5b658 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -32,10 +32,11 @@ #include "flow/Stats.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/CommitTransaction.h" +#include "fdbclient/Notified.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include "fdbserver/CoordinationInterface.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "flow/actorcompiler.h" // has to be last include @@ -55,7 +56,7 @@ typedef std::map>> VersionedMutations ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id); ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self); -ACTOR Future handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference self); +void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference self); // Helper class for reading restore data from a buffer and throwing the right errors. This struct is mostly copied from StringRefReader. We add a sanity check in this struct. @@ -90,12 +91,12 @@ struct StringRefReaderMX { // Functions for consuming big endian (network byte order) integers. // Consumes a big endian number, swaps it to little endian, and returns it. - const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } - const uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } + int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } + uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } // Convert big Endian value (e.g., encoded in log file) into a littleEndian uint64_t value.
- const int64_t consumeNetworkInt64() { return (int64_t)bigEndian64((uint32_t)consume()); } - const uint64_t consumeNetworkUInt64() { return bigEndian64(consume()); } + int64_t consumeNetworkInt64() { return (int64_t)bigEndian64((uint32_t)consume()); } + uint64_t consumeNetworkUInt64() { return bigEndian64(consume()); } bool eof() { return rptr == end; } @@ -114,6 +115,8 @@ public: std::map appliersInterf; RestoreApplierInterface masterApplierInterf; + NotifiedVersion versionBatchId; // Continuously increase for each versionBatch + bool versionBatchStart = false; uint32_t inProgressFlag = 0; @@ -135,4 +138,4 @@ public: }; #include "flow/unactorcompiler.h" -#endif \ No newline at end of file +#endif diff --git a/fdbserver/RestoreUtil.h b/fdbserver/RestoreUtil.h index 9045c9828e..698cc33af2 100644 --- a/fdbserver/RestoreUtil.h +++ b/fdbserver/RestoreUtil.h @@ -34,6 +34,9 @@ #include #include +//#define SevFRMutationInfo SevVerbose +#define SevFRMutationInfo SevInfo + enum class RestoreRole { Invalid = 0, Master = 1, Loader, Applier }; BINARY_SERIALIZABLE(RestoreRole); std::string getRoleStr(RestoreRole role); diff --git a/fdbserver/RestoreWorker.actor.cpp b/fdbserver/RestoreWorker.actor.cpp index c53bbd6be1..becbc75ddb 100644 --- a/fdbserver/RestoreWorker.actor.cpp +++ b/fdbserver/RestoreWorker.actor.cpp @@ -1,5 +1,5 @@ /* - * Restore.actor.cpp + * RestoreWorker.actor.cpp * * This source file is part of the FoundationDB open source project * @@ -98,8 +98,9 @@ ACTOR Future handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Refer self->loaderInterf = RestoreLoaderInterface(); self->loaderInterf.get().initEndpoints(); RestoreLoaderInterface& recruited = self->loaderInterf.get(); - DUMPTOKEN(recruited.setApplierKeyRangeVectorRequest); DUMPTOKEN(recruited.initVersionBatch); + DUMPTOKEN(recruited.loadFile); + DUMPTOKEN(recruited.sendMutations); DUMPTOKEN(recruited.collectRestoreRoleInterfaces); DUMPTOKEN(recruited.finishRestore); actors->add(restoreLoaderCore(self->loaderInterf.get(), req.nodeIndex, cx)); @@ -183,7 +184,7 @@ void initRestoreWorkerConfig() { opConfig.num_loaders = g_network->isSimulated() ? 3 : opConfig.num_loaders; opConfig.num_appliers = g_network->isSimulated() ? 3 : opConfig.num_appliers; opConfig.transactionBatchSizeThreshold = - g_network->isSimulated() ? 1 : opConfig.transactionBatchSizeThreshold; // Byte + g_network->isSimulated() ? 
512 : opConfig.transactionBatchSizeThreshold; // Byte TraceEvent("FastRestore") .detail("InitOpConfig", "Result") .detail("NumLoaders", opConfig.num_loaders) diff --git a/fdbserver/RestoreWorker.actor.h b/fdbserver/RestoreWorker.actor.h index b17fe984c1..7b26899ab9 100644 --- a/fdbserver/RestoreWorker.actor.h +++ b/fdbserver/RestoreWorker.actor.h @@ -34,7 +34,7 @@ #include #include -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/RestoreCommon.actor.h" #include "fdbserver/RestoreRoleCommon.actor.h" @@ -70,4 +70,4 @@ struct RestoreWorkerData : NonCopyable, public ReferenceCounted priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails Optional latencyBandConfig; + std::vector> storageCaches; explicit ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED) {} @@ -58,7 +59,7 @@ struct ServerDBInfo { template void serialize( Ar& ar ) { - serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig); + serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches); } }; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index efd80242f4..9b5b14df36 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -43,7 +43,7 @@ #undef min extern "C" int g_expect_full_pointermap; -extern const char* getHGVersion(); +extern const char* getSourceVersion(); const int MACHINE_REBOOT_TIME = 10; @@ -232,7 +232,7 @@ ACTOR Future simulatedFDBDRebooter(Referenceexcluded) .detail("UsingSSL", sslEnabled); TraceEvent("ProgramStart").detail("Cycles", cycles).detail("RandomId", randomId) - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detail("DataFolder", *dataFolder) @@ -1254,6 +1254,13 @@ void setupSimulatedSystem(vector>* systemActors, std::string baseFo int dcCoordinators = coordinatorCount / dataCenters + (dc < coordinatorCount%dataCenters); printf("Datacenter %d: %d/%d machines, %d/%d coordinators\n", dc, machines, machineCount, dcCoordinators, coordinatorCount); ASSERT( dcCoordinators <= machines ); + + //FIXME: temporary code to test storage cache + //TODO: caching disabled for this merge + //if(dc==0) { + // machines++; + //} + int useSeedForMachine = deterministicRandom()->randomInt(0, machines); Standalone zoneId; Standalone newZoneId; @@ -1277,6 +1284,13 @@ void setupSimulatedSystem(vector>* systemActors, std::string baseFo nonVersatileMachines++; } + //FIXME: temporary code to test storage cache + //TODO: caching disabled for this merge + //if(machine==machines-1 && dc==0) { + // processClass = ProcessClass(ProcessClass::StorageCacheClass, ProcessClass::CommandLineSource); + // nonVersatileMachines++; + //} + std::vector ips; for (int i = 0; i < processesPerMachine; i++) { ips.push_back(makeIPAddressForSim(useIPv6, { 2, dc, deterministicRandom()->randomInt(1, i + 2), machine
})); @@ -1395,8 +1409,6 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot state int extraDB = 0; state int minimumReplication = 0; state int minimumRegions = 0; - state float timeout = 5400; // old default is 5400 seconds - state float buggify_timeout = 36000.0; // old default is 36000 seconds checkExtraDB(testFile, extraDB, minimumReplication, minimumRegions); // TODO (IPv6) Use IPv6? diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index d981ccae16..223b934bb3 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1151,26 +1151,61 @@ ACTOR static Future consistencyCheckStatusFetcher(Database cx, JsonBuilder return Void(); } +struct LogRangeAndUID { + KeyRange range; + UID destID; + + LogRangeAndUID(KeyRange const& range, UID const& destID) : range(range), destID(destID) {} + + bool operator < (LogRangeAndUID const& r) const { + if(range.begin != r.range.begin) return range.begin < r.range.begin; + if(range.end != r.range.end) return range.end < r.range.end; + return destID < r.destID; + } +}; + ACTOR static Future logRangeWarningFetcher(Database cx, JsonBuilderArray *messages, std::set *incomplete_reasons) { try { state Transaction tr(cx); + state Future timeoutFuture = timeoutError(Future(Never()), 5.0); loop { try { tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); - tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - Standalone existingDestUidValues = wait(timeoutError(tr.getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY), 5.0)); - std::set> existingRanges; - for(auto it : existingDestUidValues) { - KeyRange range = BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); - std::pair rangePair = std::make_pair(range.begin,range.end); - if(existingRanges.count(rangePair)) { - messages->push_back(JsonString::makeMessage("duplicate_mutation_streams", format("Backup and DR are not sharing the same stream of mutations for `%s` - `%s`", printable(range.begin).c_str(), printable(range.end).c_str()).c_str())); - break; - } - existingRanges.insert(rangePair); + state Future> existingDestUidValues = tr.getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY); + state Future> existingLogRanges = tr.getRange(logRangesRange, CLIENT_KNOBS->TOO_MANY); + wait( (success(existingDestUidValues) && success(existingLogRanges)) || timeoutFuture ); + + std::set loggingRanges; + for(auto& it : existingLogRanges.get()) { + Key logDestination; + UID logUid; + KeyRef logRangeBegin = logRangesDecodeKey(it.key, &logUid); + Key logRangeEnd = logRangesDecodeValue(it.value, &logDestination); + loggingRanges.insert(LogRangeAndUID(KeyRangeRef(logRangeBegin, logRangeEnd), logUid)); } + + std::set> existingRanges; + for(auto& it : existingDestUidValues.get()) { + KeyRange range = BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); + UID logUid = BinaryReader::fromStringRef(it.value, Unversioned()); + if(loggingRanges.count(LogRangeAndUID(range, logUid))) { + std::pair rangePair = std::make_pair(range.begin,range.end); + if(existingRanges.count(rangePair)) { + messages->push_back(JsonString::makeMessage("duplicate_mutation_streams", format("Backup and DR are not sharing the same stream of mutations for `%s` - 
`%s`", printable(range.begin).c_str(), printable(range.end).c_str()).c_str())); + break; + } + existingRanges.insert(rangePair); + } else { + //This cleanup is done during status, because it should only be required once after upgrading to 6.2.7 or later. + //There is no other good location to detect that the metadata is mismatched. + TraceEvent(SevWarnAlways, "CleaningDestUidLookup").detail("K", it.key.printable()).detail("V", it.value.printable()); + tr.clear(it.key); + } + } + wait(tr.commit() || timeoutFuture); break; } catch(Error &e) { if(e.code() == error_code_timed_out) { diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp new file mode 100644 index 0000000000..2887d2017c --- /dev/null +++ b/fdbserver/StorageCache.actor.cpp @@ -0,0 +1,1007 @@ +/* + * StorageCache.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/Knobs.h" +#include "fdbserver/ServerDBInfo.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/VersionedMap.h" +#include "fdbclient/KeyRangeMap.h" +#include "fdbclient/Atomic.h" +#include "fdbclient/Notified.h" +#include "fdbserver/LogSystem.h" +#include "fdbserver/WaitFailure.h" +#include "fdbserver/WorkerInterface.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. + + +//TODO storageCache server shares quite a bit of storageServer functionality, although simplified +// Need to look into refactoring common code out for better code readability and to avoid duplication + +//TODO rename wrong_shard_server error to wrong_cache_server +inline bool canReplyWith(Error e) { + switch(e.code()) { + case error_code_transaction_too_old: + case error_code_future_version: + case error_code_wrong_shard_server: + case error_code_process_behind: + //case error_code_all_alternatives_failed: + return true; + default: + return false; + }; +} + +const int VERSION_OVERHEAD = 64 + sizeof(Version) + sizeof(Standalone) + //mutationLog, 64b overhead for map + 2 * (64 + sizeof(Version) + sizeof(Reference::PTreeT>)); //versioned map [ x2 for createNewVersion(version+1) ], 64b overhead for map +static int mvccStorageBytes( MutationRef const& m ) { return VersionedMap::overheadPerItem * 2 + (MutationRef::OVERHEAD_BYTES + m.param1.size() + m.param2.size()) * 2; } + +struct StorageCacheData { + typedef VersionedMap VersionedData; +private: + // in-memory versioned struct (PTree as of now. 
Subject to change) + VersionedData versionedData; + // in-memory mutationLog that the versionedData contains references to + // TODO change it to a deque, already contains mutations in version order + std::map> mutationLog; // versions (durableVersion, version] + +public: + UID thisServerID; // unique id + uint16_t index; // server index + Reference>> logSystem; + Key ck; //cacheKey + KeyRangeMap cachedRangeMap; // map of cached key-ranges + + // The following are in rough order from newest to oldest + // TODO double check which ones we need for storageCache servers + Version lastTLogVersion, lastVersionWithData; + NotifiedVersion version; // current version i.e. the max version that can be read from the cache + NotifiedVersion desiredOldestVersion; // oldestVersion can be increased to this after compaction + NotifiedVersion oldestVersion; // Min version that might be read from the cache + + // TODO not really in use as of now. may need in some failure cases. Revisit and remove if no plausible use + Future compactionInProgress; + + // TODO do we need otherError here? + Promise otherError; + + int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this cache server + bool behind; + + // TODO double check which ones we need for storageCache servers + struct Counters { + CounterCollection cc; + Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, finishedQueries, rowsQueried, bytesQueried, watchQueries; + Counter bytesInput, mutationBytes; // Like bytesInput but without MVCC accounting + Counter mutations, setMutations, clearRangeMutations, atomicMutations; + Counter updateBatches, updateVersions; + Counter loops; + Counter readsRejected; + + //LatencyBands readLatencyBands; + + Counters(StorageCacheData* self) + : cc("StorageCacheServer", self->thisServerID.toString()), + getKeyQueries("GetKeyQueries", cc), + getValueQueries("GetValueQueries",cc), + getRangeQueries("GetRangeQueries", cc), + allQueries("QueryQueue", cc), + finishedQueries("FinishedQueries", cc), + rowsQueried("RowsQueried", cc), + bytesQueried("BytesQueried", cc), + watchQueries("WatchQueries", cc), + bytesInput("BytesInput", cc), + mutationBytes("MutationBytes", cc), + mutations("Mutations", cc), + setMutations("SetMutations", cc), + clearRangeMutations("ClearRangeMutations", cc), + atomicMutations("AtomicMutations", cc), + updateBatches("UpdateBatches", cc), + updateVersions("UpdateVersions", cc), + loops("Loops", cc), + readsRejected("ReadsRejected", cc) + { + specialCounter(cc, "LastTLogVersion", [self](){ return self->lastTLogVersion; }); + specialCounter(cc, "Version", [self](){ return self->version.get(); }); + specialCounter(cc, "VersionLag", [self](){ return self->versionLag; }); + } + } counters; + + explicit StorageCacheData(UID thisServerID, uint16_t index) + : thisServerID(thisServerID), index(index), + logSystem(new AsyncVar>()), + lastTLogVersion(0), lastVersionWithData(0), + compactionInProgress(Void()), + versionLag(0), behind(false), counters(this) + { + version.initMetric(LiteralStringRef("StorageCacheData.Version"), counters.cc.id); + desiredOldestVersion.initMetric(LiteralStringRef("StorageCacheData.DesiredOldestVersion"), counters.cc.id); + oldestVersion.initMetric(LiteralStringRef("StorageCacheData.OldestVersion"), counters.cc.id); + } + + void addMutation(KeyRangeRef const& cachedKeyRange, Version version, MutationRef const& mutation); + + bool isReadable( KeyRangeRef const& keys ) { + auto cr = cachedRangeMap.intersectingRanges(keys);
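+ // The cache can answer a read only if every cached sub-range intersecting the requested
+ // keys is marked present (true); a single gap makes the whole range unreadable from this server.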
for(auto i = cr.begin(); i != cr.end(); ++i) + if (!i->value()) + return false; + return true; + } + + Arena lastArena; + std::map> const & getMutationLog() { return mutationLog; } + std::map>& getMutableMutationLog() { return mutationLog; } + VersionedData const& data() const { return versionedData; } + VersionedData& mutableData() { return versionedData; } + + Standalone& addVersionToMutationLog(Version v) { + // return existing version... + auto m = mutationLog.find(v); + if (m != mutationLog.end()) + return m->second; + + // ...or create a new one + auto& u = mutationLog[v]; + u.version = v; + if (lastArena.getSize() >= 65536) lastArena = Arena(4096); + u.arena() = lastArena; + counters.bytesInput += VERSION_OVERHEAD; + return u; + } + + MutationRef addMutationToMutationLog(Standalone &mLV, MutationRef const& m){ + //TODO find out more + //byteSampleApplyMutation(m, mLV.version); + counters.bytesInput += mvccStorageBytes(m); + return mLV.mutations.push_back_deep( mLV.arena(), m ); + } + +}; + +///////////////////////////////////// Queries ///////////////////////////////// +#pragma region Queries +ACTOR Future waitForVersion( StorageCacheData* data, Version version ) { + // This could become an Actor transparently, but for now it just does the lookup + if (version == latestVersion) + version = std::max(Version(1), data->version.get()); + if (version < data->oldestVersion.get() || version <= 0) throw transaction_too_old(); + else if (version <= data->version.get()) + return version; + + if(data->behind && version > data->version.get()) { + throw process_behind(); + } + + if(deterministicRandom()->random01() < 0.001) + TraceEvent("WaitForVersion1000x"); + choose { + when ( wait( data->version.whenAtLeast(version) ) ) { + //FIXME: A bunch of these can block with or without the following delay 0. 
+ //wait( delay(0) ); // don't do a whole bunch of these at once + if (version < data->oldestVersion.get()) throw transaction_too_old(); + return version; + } + when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) { + if(deterministicRandom()->random01() < 0.001) + TraceEvent(SevWarn, "CacheServerFutureVersion1000x", data->thisServerID) + .detail("Version", version) + .detail("MyVersion", data->version.get()) + .detail("ServerID", data->thisServerID); + throw future_version(); + } + } +} + +ACTOR Future waitForVersionNoTooOld( StorageCacheData* data, Version version ) { + // This could become an Actor transparently, but for now it just does the lookup + if (version == latestVersion) + version = std::max(Version(1), data->version.get()); + if (version <= data->version.get()) + return version; + choose { + when ( wait( data->version.whenAtLeast(version) ) ) { + return version; + } + when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) { + if(deterministicRandom()->random01() < 0.001) + TraceEvent(SevWarn, "CacheServerFutureVersion1000x", data->thisServerID) + .detail("Version", version) + .detail("MyVersion", data->version.get()) + .detail("ServerID", data->thisServerID); + throw future_version(); + } + } +} + +ACTOR Future getValueQ( StorageCacheData* data, GetValueRequest req ) { + state int64_t resultSize = 0; + + try { + ++data->counters.getValueQueries; + ++data->counters.allQueries; + //++data->readQueueSizeMetric; + //TODO later + //data->maxQueryQueue = std::max( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue()); + + // Active load balancing runs at a very high priority (to obtain accurate queue lengths) + // so we need to downgrade here + + //TODO what's this? + wait( delay(0, TaskPriority::DefaultEndpoint) ); + + if( req.debugID.present() ) + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); + + state Optional v; + state Version version = wait( waitForVersion( data, req.version ) ); + if( req.debugID.present() ) + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask()); + + if (!data->cachedRangeMap[req.key]) { + //TraceEvent("WrongCacheServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ"); + throw wrong_shard_server(); + } + + state int path = 0; + auto i = data->data().at(version).lastLessOrEqual(req.key); + if (i && i->isValue() && i.key() == req.key) { + v = (Value)i->getValue(); + path = 1; + } + + //debugMutation("CacheGetValue", version, MutationRef(MutationRef::DebugKey, req.key, v.present()?v.get():LiteralStringRef(""))); + //debugMutation("CacheGetPath", version, MutationRef(MutationRef::DebugKey, req.key, path==0?LiteralStringRef("0"):path==1?LiteralStringRef("1"):LiteralStringRef("2"))); + + if (v.present()) { + ++data->counters.rowsQueried; + resultSize = v.get().size(); + data->counters.bytesQueried += resultSize; + } + + if( req.debugID.present() ) + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask()); + + GetValueReply reply(v); + req.reply.send(reply); + } catch (Error& e) { + if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + //--data->readQueueSizeMetric; + //if(data->latencyBandConfig.present()) { + // int maxReadBytes = 
data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits::max()); + // data->counters.readLatencyBands.addMeasurement(timer() - req.requestTime(), resultSize > maxReadBytes); + //} + + return Void(); +}; + +//TODO Implement the reverse readRange +GetKeyValuesReply readRange(StorageCacheData* data, Version version, KeyRangeRef range, int limit, int* pLimitBytes) { + GetKeyValuesReply result; + StorageCacheData::VersionedData::ViewAtVersion view = data->data().at(version); + StorageCacheData::VersionedData::iterator vCurrent = view.end(); + KeyRef readBegin; + KeyRef rangeBegin = range.begin; + KeyRef rangeEnd = range.end; + + //We might care about a clear beginning before start that runs into range + vCurrent = view.lastLessOrEqual(rangeBegin); + if (vCurrent && vCurrent->isClearTo() && vCurrent->getEndKey() > rangeBegin) + readBegin = vCurrent->getEndKey(); + else + readBegin = rangeBegin; + + vCurrent = view.lower_bound(readBegin); + ASSERT(!vCurrent || vCurrent.key() >= readBegin); + if (vCurrent) { + auto b = vCurrent; + --b; + ASSERT(!b || b.key() < readBegin); + } + int accumulatedBytes = 0; + while (vCurrent && vCurrent.key() < rangeEnd && limit > 0 && accumulatedBytes < *pLimitBytes) { + if (!vCurrent->isClearTo()) { + result.data.push_back_deep(result.arena, KeyValueRef(vCurrent.key(), vCurrent->getValue())); + accumulatedBytes += sizeof(KeyValueRef) + result.data.end()[-1].expectedSize(); + --limit; + } + ++vCurrent; + } + + *pLimitBytes -= accumulatedBytes; + ASSERT(result.data.size() == 0 || *pLimitBytes + result.data.end()[-1].expectedSize() + sizeof(KeyValueRef) > 0); + result.more = limit == 0 || *pLimitBytes <= 0; // FIXME: Does this have to be exact? + result.version = version; + return result; +} + +Key findKey( StorageCacheData* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset) +// Attempts to find the key indicated by sel in the data at version, within range. +// Precondition: selectorInRange(sel, range) +// If it is found, offset is set to 0 and a key is returned which falls inside range. +// If the search would depend on any key outside range OR if the key selector offset is too large (range read returns too many bytes), it returns either +// a negative offset and a key in [range.begin, sel.getKey()], indicating the key is (the first key <= returned key) + offset, or +// a positive offset and a key in (sel.getKey(), range.end], indicating the key is (the first key >= returned key) + offset-1 +// The range passed in to this function should specify a shard. If range.begin is repeatedly not the beginning of a shard, then it is possible to get stuck looping here +{ + ASSERT( version != latestVersion ); + ASSERT( selectorInRange(sel, range) && version >= data->oldestVersion.get()); + + // Count forward or backward distance items, skipping the first one if it == key and skipEqualKey + bool forward = sel.offset > 0; // If forward, result >= sel.getKey(); else result <= sel.getKey() + int sign = forward ? +1 : -1; + bool skipEqualKey = sel.orEqual == forward; + int distance = forward ? sel.offset : 1-sel.offset; + + //Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from the read range in this case) + int maxBytes; + if (sel.offset <= 1 && sel.offset >= 0) + maxBytes = std::numeric_limits::max(); + else + maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES; + + GetKeyValuesReply rep = readRange( data, version, + forward ? 
KeyRangeRef(sel.getKey(), range.end) : KeyRangeRef(range.begin, keyAfter(sel.getKey())), + (distance + skipEqualKey)*sign, &maxBytes ); + bool more = rep.more && rep.data.size() != distance + skipEqualKey; + + //If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in a loop + if(more && !forward && rep.data.size() == 1) { + TEST(true); //Reverse key selector returned only one result in range read + maxBytes = std::numeric_limits::max(); + GetKeyValuesReply rep2 = readRange( data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes ); + rep = rep2; + more = rep.more && rep.data.size() != distance + skipEqualKey; + ASSERT(rep.data.size() == 2 || !more); + } + + int index = distance-1; + if (skipEqualKey && rep.data.size() && rep.data[0].key == sel.getKey() ) + ++index; + + if (index < rep.data.size()) { + *pOffset = 0; + return rep.data[ index ].key; + } else { + // FIXME: If range.begin=="" && !forward, return success? + *pOffset = index - rep.data.size() + 1; + if (!forward) *pOffset = -*pOffset; + + if (more) { + TEST(true); // Key selector read range had more results + + ASSERT(rep.data.size()); + Key returnKey = forward ? keyAfter(rep.data.back().key) : rep.data.back().key; + + //This is possible if key/value pairs are very large and only one result is returned on a last less than query + //SOMEDAY: graceful handling of exceptionally sized values + ASSERT(returnKey != sel.getKey()); + + return returnKey; + } else + return forward ? range.end : range.begin; + } +} + +KeyRange getCachedKeyRange( StorageCacheData* data, const KeySelectorRef& sel ) +// Returns largest range that is cached on this server and selectorInRange(sel, range) or wrong_shard_server if no such range exists +{ + auto i = sel.isBackward() ? 
data->cachedRangeMap.rangeContainingKeyBefore( sel.getKey() ) : + data->cachedRangeMap.rangeContaining( sel.getKey() ); + if (!i->value()) throw wrong_shard_server(); + ASSERT( selectorInRange(sel, i->range()) ); + return i->range(); +} + +ACTOR Future getKeyValues( StorageCacheData* data, GetKeyValuesRequest req ) +// Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large selector offset prevents +// all data from being read in one range read +{ + state int64_t resultSize = 0; + + ++data->counters.getRangeQueries; + ++data->counters.allQueries; + //++data->readQueueSizeMetric; + //data->maxQueryQueue = std::max( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue()); + + // Active load balancing runs at a very high priority (to obtain accurate queue lengths) + // so we need to downgrade here + TaskPriority taskType = TaskPriority::DefaultEndpoint; + if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { + taskType = TaskPriority::FetchKeys; + // } else if (false) { + // // Placeholder for up-prioritizing fetches for important requests + // taskType = TaskPriority::DefaultDelay; + } + wait( delay(0, taskType) ); + + try { + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.Before"); + state Version version = wait( waitForVersion( data, req.version ) ); + + try { + state KeyRange cachedKeyRange = getCachedKeyRange( data, req.begin ); + + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterVersion"); + //.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end); + } catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard", "None").detail("In", "getKeyValues>getShardKeyRange"); throw e; } + + if ( !selectorInRange(req.end, cachedKeyRange) && !(req.end.isFirstGreaterOrEqual() && req.end.getKey() == cachedKeyRange.end) ) { +// TraceEvent("WrongShardServer1", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkShardExtents"); + throw wrong_shard_server(); + } + + state int offset1; + state int offset2; + state Key begin = req.begin.isFirstGreaterOrEqual() ? req.begin.getKey() : findKey( data, req.begin, version, cachedKeyRange, &offset1 ); + state Key end = req.end.isFirstGreaterOrEqual() ? req.end.getKey() : findKey( data, req.end, version, cachedKeyRange, &offset2 ); + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterKeys"); + //.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey()); + + // Offsets of zero indicate begin/end keys in this cachedKeyRange, which obviously means we can answer the query + // An end offset of 1 is also OK because the end key is exclusive, so if the first key of the next cachedKeyRange is the end the last actual key returned must be from this cachedKeyRange. 
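+ // For example, an end selector that resolves to the first key after this cachedKeyRange comes back
+ // with offset 1; since the end is exclusive, every key actually returned still lives in this range.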
+ // A begin offset of 1 is also OK because then either begin is past end or equal to end (so the result is definitely empty) + if ((offset1 && offset1!=1) || (offset2 && offset2!=1)) { + TEST(true); // wrong_cache_server due to offset + // We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end, and return a clipped range rather + // than an error (since that is what the NativeAPI.getRange will do anyway via its "slow path"), but we would have to add some flags to the response + // to encode whether we went off the beginning and the end, since it needs that information. + //TraceEvent("WrongShardServer2", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkOffsets").detail("BeginKey", begin).detail("EndKey", end).detail("BeginOffset", offset1).detail("EndOffset", offset2); + throw wrong_shard_server(); + } + + if (begin >= end) { + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.Send"); + //.detail("Begin",begin).detail("End",end); + + GetKeyValuesReply none; + none.version = version; + none.more = false; + req.reply.send( none ); + } else { + state int remainingLimitBytes = req.limitBytes; + + GetKeyValuesReply _r = readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes); + GetKeyValuesReply r = _r; + + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterReadRange"); + //.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size()); + if (EXPENSIVE_VALIDATION) { + for (int i = 0; i < r.data.size(); i++) + ASSERT(r.data[i].key >= begin && r.data[i].key < end); + ASSERT(r.data.size() <= std::abs(req.limit)); + } + + req.reply.send( r ); + + resultSize = req.limitBytes - remainingLimitBytes; + data->counters.bytesQueried += resultSize; + data->counters.rowsQueried += r.data.size(); + } + } catch (Error& e) { + if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + + return Void(); +} + +ACTOR Future getKey( StorageCacheData* data, GetKeyRequest req ) { + state int64_t resultSize = 0; + + ++data->counters.getKeyQueries; + ++data->counters.allQueries; + + // Active load balancing runs at a very high priority (to obtain accurate queue lengths) + // so we need to downgrade here + wait( delay(0, TaskPriority::DefaultEndpoint) ); + + try { + state Version version = wait( waitForVersion( data, req.version ) ); + state KeyRange cachedKeyRange = getCachedKeyRange( data, req.sel ); + + state int offset; + Key k = findKey( data, req.sel, version, cachedKeyRange, &offset ); + + KeySelector updated; + if (offset < 0) + updated = firstGreaterOrEqual(k)+offset; // first thing on this shard OR (large offset case) smallest key retrieved in range read + else if (offset > 0) + updated = firstGreaterOrEqual(k)+offset-1; // first thing on next shard OR (large offset case) keyAfter largest key retrieved in range read + else + updated = KeySelectorRef(k,true,0); //found + + resultSize = k.size(); + data->counters.bytesQueried += resultSize; + ++data->counters.rowsQueried; + + GetKeyReply reply(updated); + req.reply.send(reply); + } + catch (Error& e) { + if (e.code() == error_code_wrong_shard_server) TraceEvent("WrongShardServer").detail("In","getKey"); + 
if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + + return Void(); +} + +#pragma endregion + +bool expandMutation( MutationRef& m, StorageCacheData::VersionedData const& data, KeyRef eagerTrustedEnd, Arena& ar ) { + // After this function call, m should be copied into an arena immediately (before modifying data, shards, or eager) + if (m.type == MutationRef::ClearRange) { + // Expand the clear + const auto& d = data.atLatest(); + + // If another clear overlaps the beginning of this one, engulf it + auto i = d.lastLess(m.param1); + if (i && i->isClearTo() && i->getEndKey() >= m.param1) + m.param1 = i.key(); + + // If another clear overlaps the end of this one, engulf it; otherwise expand + i = d.lastLessOrEqual(m.param2); + if (i && i->isClearTo() && i->getEndKey() >= m.param2) { + m.param2 = i->getEndKey(); + } else { + // Expand to the next set or clear (from storage or latestVersion), and if it + // is a clear, engulf it as well + i = d.lower_bound(m.param2); + //KeyRef endKeyAtStorageVersion = m.param2 == eagerTrustedEnd ? eagerTrustedEnd : std::min( eager->getKeyEnd( m.param2 ), eagerTrustedEnd ); + // TODO check if the following is correct + KeyRef endKeyAtStorageVersion = eagerTrustedEnd; + if (!i || endKeyAtStorageVersion < i.key()) + m.param2 = endKeyAtStorageVersion; + else if (i->isClearTo()) + m.param2 = i->getEndKey(); + else + m.param2 = i.key(); + } + } + else if (m.type != MutationRef::SetValue && (m.type)) { + + Optional oldVal; + auto it = data.atLatest().lastLessOrEqual(m.param1); + if (it != data.atLatest().end() && it->isValue() && it.key() == m.param1) + oldVal = it->getValue(); + else if (it != data.atLatest().end() && it->isClearTo() && it->getEndKey() > m.param1) { + TEST(true); // Atomic op right after a clear. 
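+ // The key was covered by a more recent clear, so oldVal stays empty and the atomic
+ // helpers below see an absent value and apply their defined missing-value behavior.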
+ } + switch(m.type) { + case MutationRef::AddValue: + m.param2 = doLittleEndianAdd(oldVal, m.param2, ar); + break; + case MutationRef::And: + m.param2 = doAnd(oldVal, m.param2, ar); + break; + case MutationRef::Or: + m.param2 = doOr(oldVal, m.param2, ar); + break; + case MutationRef::Xor: + m.param2 = doXor(oldVal, m.param2, ar); + break; + case MutationRef::AppendIfFits: + m.param2 = doAppendIfFits(oldVal, m.param2, ar); + break; + case MutationRef::Max: + m.param2 = doMax(oldVal, m.param2, ar); + break; + case MutationRef::Min: + m.param2 = doMin(oldVal, m.param2, ar); + break; + case MutationRef::ByteMin: + m.param2 = doByteMin(oldVal, m.param2, ar); + break; + case MutationRef::ByteMax: + m.param2 = doByteMax(oldVal, m.param2, ar); + break; + case MutationRef::MinV2: + m.param2 = doMinV2(oldVal, m.param2, ar); + break; + case MutationRef::AndV2: + m.param2 = doAndV2(oldVal, m.param2, ar); + break; + case MutationRef::CompareAndClear: + if (oldVal.present() && m.param2 == oldVal.get()) { + m.type = MutationRef::ClearRange; + m.param2 = keyAfter(m.param1, ar); + return expandMutation(m, data, eagerTrustedEnd, ar); + } + return false; + } + m.type = MutationRef::SetValue; + } + + return true; +} + +// Applies a write mutation (SetValue or ClearRange) to the in-memory versioned data structure +void applyMutation( StorageCacheData *self, MutationRef const& m, Arena& arena, StorageCacheData::VersionedData &data ) { + // m is expected to be in arena already + // Clear split keys are added to arena + + if (m.type == MutationRef::SetValue) { + auto prev = data.atLatest().lastLessOrEqual(m.param1); + if (prev && prev->isClearTo() && prev->getEndKey() > m.param1) { + ASSERT( prev.key() <= m.param1 ); + KeyRef end = prev->getEndKey(); + // TODO double check if the insert version of the previous clear needs to be preserved for the "left half", + // insert() invalidates prev, so prev.key() is not safe to pass to it by reference + data.insert( KeyRef(prev.key()), ValueOrClearToRef::clearTo( m.param1 ), prev.insertVersion() ); // overwritten by below insert if empty + KeyRef nextKey = keyAfter(m.param1, arena); + if ( end != nextKey ) { + ASSERT( end > nextKey ); + // TODO double check if it's okay to let go of the insert version of the "right half" + // FIXME: This copy is technically an asymptotic problem, definitely a waste of memory (copy of keyAfter is a waste, but not asymptotic) + data.insert( nextKey, ValueOrClearToRef::clearTo( KeyRef(arena, end) ) ); + } + } + data.insert( m.param1, ValueOrClearToRef::value(m.param2) ); + } else if (m.type == MutationRef::ClearRange) { + data.erase( m.param1, m.param2 ); + ASSERT( m.param2 > m.param1 ); + ASSERT( !data.isClearContaining( data.atLatest(), m.param1 ) ); + data.insert( m.param1, ValueOrClearToRef::clearTo(m.param2) ); + } +} + +template +void splitMutation(StorageCacheData* data, KeyRangeMap& map, MutationRef const& m, Version ver) { + if(isSingleKeyMutation((MutationRef::Type) m.type)) { + auto i = map.rangeContaining(m.param1); + if (i->value()) // If this key lies in the cached key-range on this server + data->addMutation( i->range(), ver, m ); + } + else if (m.type == MutationRef::ClearRange) { + KeyRangeRef mKeys( m.param1, m.param2 ); + auto r = map.intersectingRanges( mKeys ); + for(auto i = r.begin(); i != r.end(); ++i) { + if (i->value()) { // if this sub-range exists on this cache server + KeyRangeRef k = mKeys & i->range(); + data->addMutation( i->range(), ver, MutationRef((MutationRef::Type)m.type, k.begin, k.end) ); + } + }
+ } else + ASSERT(false); // Unknown mutation type in splitMutations +} + +void StorageCacheData::addMutation(KeyRangeRef const& cachedKeyRange, Version version, MutationRef const& mutation) { + MutationRef expanded = mutation; + auto& mLog = addVersionToMutationLog(version); + + if ( !expandMutation( expanded, data(), cachedKeyRange.end, mLog.arena()) ) { + return; + } + expanded = addMutationToMutationLog(mLog, expanded); + if (debugMutation("expandedMutation", version, expanded)) { + const char* type = + mutation.type == MutationRef::SetValue ? "SetValue" : + mutation.type == MutationRef::ClearRange ? "ClearRange" : + mutation.type == MutationRef::DebugKeyRange ? "DebugKeyRange" : + mutation.type == MutationRef::DebugKey ? "DebugKey" : + "UnknownMutation"; + printf("DEBUGMUTATION:\t%.6f\t%s\t%s\t%s\t%s\t%s\n", + now(), g_network->getLocalAddress().toString().c_str(), "originalMutation", + type, printable(mutation.param1).c_str(), printable(mutation.param2).c_str()); + printf(" Cached Key-range: %s - %s\n", printable(cachedKeyRange.begin).c_str(), printable(cachedKeyRange.end).c_str()); + } + applyMutation( this, expanded, mLog.arena(), mutableData() ); + printf("\nSCUpdate: Printing versioned tree after applying mutation\n"); + mutableData().printTree(version); + +} + +// Helper class for updating the storage cache (i.e. applying mutations) +class StorageCacheUpdater { +public: + StorageCacheUpdater() : currentVersion(invalidVersion), processedCacheStartKey(false) {} + StorageCacheUpdater(Version currentVersion) : currentVersion(currentVersion), processedCacheStartKey(false) {} + + void applyMutation(StorageCacheData* data, MutationRef const& m , Version ver) { + //TraceEvent("SCNewVersion", data->thisServerID).detail("VerWas", data->mutableData().latestVersion).detail("ChVer", ver); + + if(currentVersion != ver) { + currentVersion = ver; + data->mutableData().createNewVersion(ver); + } + + if (m.param1.startsWith( systemKeys.end )) { + //TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver); + applyPrivateCacheData( data, m ); + } else { + // FIXME: enable when debugMutation is active + //for(auto m = changes[c].mutations.begin(); m; ++m) { + // debugMutation("SCUpdateMutation", changes[c].version, *m); + //} + + splitMutation(data, data->cachedRangeMap, m, ver); + } + + //TODO + if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); + } + + Version currentVersion; +private: + KeyRef cacheStartKey; + bool nowAssigned; + bool processedCacheStartKey; + + // Applies private mutations, as the name suggests. It basically establishes the key-ranges + // that this cache server is responsible for + // TODO Revisit during failure handling. Might we lose some private mutations? + void applyPrivateCacheData( StorageCacheData* data, MutationRef const& m ) { + TraceEvent(SevDebug, "SCPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString()); + + if (processedCacheStartKey) { + // we expect changes in pairs, [begin,end).
This mutation is for the end key of the range + ASSERT (m.type == MutationRef::SetValue && m.param1.startsWith(data->ck)); + KeyRangeRef keys( cacheStartKey.removePrefix(data->ck), m.param1.removePrefix(data->ck)); + data->cachedRangeMap.insert(keys, true); + fprintf(stderr, "SCPrivateCacheMutation: begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str()); + + processedCacheStartKey = false; + } else if (m.type == MutationRef::SetValue && m.param1.startsWith( data->ck )) { + // We expect changes in pairs, [begin,end). This mutation is for the start key of the range + cacheStartKey = m.param1; + processedCacheStartKey = true; + } else { + fprintf(stderr, "SCPrivateCacheMutation: Unknown private mutation\n"); + ASSERT(false); // Unknown private mutation + } + } +}; + +// Compacts the in-memory VersionedMap, i.e. removes versions below the desiredOldestVersion +// TODO revisit if we change the data structure +ACTOR Future compactCache(StorageCacheData* data) { + loop { + //TODO understand this, should we add delay here? + //if (g_network->isSimulated()) { + // double endTime = g_simulator.checkDisabled(format("%s/compactCache", data->thisServerID.toString().c_str())); + // if(endTime > now()) { + // wait(delay(endTime - now(), TaskPriority::CompactCache)); + // } + //} + + // Wait until the desiredOldestVersion is greater than the current oldestVersion + wait( data->desiredOldestVersion.whenAtLeast( data->oldestVersion.get()+1 ) ); + wait( delay(0, TaskPriority::CompactCache) ); + + //TODO not really in use as of now. may need in some failure cases. Revisit and remove if no plausible use + state Promise compactionInProgress; + data->compactionInProgress = compactionInProgress.getFuture(); + state Version oldestVersion = data->oldestVersion.get(); + state Version desiredVersion = data->desiredOldestVersion.get(); + // Call the compaction routine that does the actual work, + // TODO It's a synchronous function call as of now. Should it be async? + data->mutableData().compact(desiredVersion); + Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( desiredVersion, + TaskPriority::CompactCache ); + data->oldestVersion.set( desiredVersion ); + wait( finishedForgetting ); + // TODO how do we yield here? This may not be enough, because compact() does the heavy lifting + // of compacting the VersionedMap. We should probably look into per version compaction and then + // we can yield after compacting one version + wait( yield(TaskPriority::CompactCache) ); + + // TODO what flowlock to acquire during compaction? + compactionInProgress.send(Void()); + wait( delay(0, TaskPriority::CompactCache) ); //Setting compactionInProgress could cause the cache server to shut down, so delay to check for cancellation + } +} + +ACTOR Future pullAsyncData( StorageCacheData *data ) { + state Future dbInfoChange = Void(); + state Reference r; + state Version tagAt = 0; + + state StorageCacheUpdater updater(data->lastVersionWithData); + state Version ver = invalidVersion; + //data->lastTLogVersion = r->getMaxKnownVersion(); + //data->versionLag = std::max(0, data->lastTLogVersion - data->version.get()); + ++data->counters.updateBatches; + + loop { + loop { + choose { + when(wait( r ?
r->getMore(TaskPriority::TLogCommit) : Never() ) ) { + break; + } + when( wait( dbInfoChange ) ) { + if( data->logSystem->get() ) + r = data->logSystem->get()->peek( data->thisServerID, tagAt, Optional(), cacheTag, true ); + else + r = Reference(); + dbInfoChange = data->logSystem->onChange(); + } + } + } + //FIXME: if the popped version is greater than our last version, we need to clear the cache + + //FIXME: ensure this can only read data from the current version + r->setProtocolVersion(currentProtocolVersion); + + // Now process the mutations + for (; r->hasMessage(); r->nextMessage()) { + ArenaReader& reader = *r->reader(); + + MutationRef msg; + reader >> msg; + fprintf(stderr, "%lld : %s\n", r->version().version, msg.toString().c_str()); + + if (r->version().version > ver && r->version().version > data->version.get()) { + ++data->counters.updateVersions; + ver = r->version().version; + } + if (ver != invalidVersion) // This change belongs to a version < minVersion + { + updater.applyMutation(data, msg, ver); + // TODO + //mutationBytes += msg.totalSize(); + data->counters.mutationBytes += msg.totalSize(); + ++data->counters.mutations; + switch(msg.type) { + case MutationRef::SetValue: + ++data->counters.setMutations; + break; + case MutationRef::ClearRange: + ++data->counters.clearRangeMutations; + break; + case MutationRef::AddValue: + case MutationRef::And: + case MutationRef::AndV2: + case MutationRef::AppendIfFits: + case MutationRef::ByteMax: + case MutationRef::ByteMin: + case MutationRef::Max: + case MutationRef::Min: + case MutationRef::MinV2: + case MutationRef::Or: + case MutationRef::Xor: + case MutationRef::CompareAndClear: + ++data->counters.atomicMutations; + break; + } + } + else + TraceEvent(SevError, "DiscardingPeekedData", data->thisServerID).detail("Mutation", msg.toString()).detail("Version", r->version().toString()); + + tagAt = r->version().version + 1; + } + + if(ver != invalidVersion) { + data->lastVersionWithData = ver; + } else { + // TODO double check + ver = r->version().version - 1; + } + + if(ver != invalidVersion && ver > data->version.get()) { + debugKeyRange("SCUpdate", ver, allKeys); + + data->mutableData().createNewVersion(ver); + + // TODO what about otherError + if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); + + // TODO may enable these later + //data->noRecentUpdates.set(false); + //data->lastUpdate = now(); + data->version.set( ver ); // Triggers replies to waiting gets for new version(s) + // TODO double check + //setDataVersion(data->thisServerID, data->version.get()); + + // TODO what about otherError + if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); + + // we can get rid of versions beyond maxVersionsInMemory at any point.
Update the + desiredOldestVersion and that may invoke the compaction actor + Version maxVersionsInMemory = SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS; + Version proposedOldestVersion = data->version.get() - maxVersionsInMemory; + proposedOldestVersion = std::max(proposedOldestVersion, data->oldestVersion.get()); + data->desiredOldestVersion.set(proposedOldestVersion); + } + + // TODO implement a validate function for the cache + //validate(data); + + if(r->version().version >= data->lastTLogVersion) { + if(data->behind) { + TraceEvent("StorageCacheNoLongerBehind", data->thisServerID).detail("CursorVersion", r->version().version).detail("TLogVersion", data->lastTLogVersion); + } + data->behind = false; + } + + tagAt = std::max( tagAt, r->version().version); + } +} + +ACTOR Future storageCache(StorageServerInterface ssi, uint16_t id, Reference> db) { + state StorageCacheData self(ssi.id(), id); + state ActorCollection actors(false); + state Future dbInfoChange = Void(); + + // This helps identify the private mutations meant for this cache server + self.ck = cacheKeysPrefixFor( id ).withPrefix(systemKeys.begin); // FFFF/02cacheKeys/[this server]/ + + actors.add(waitFailureServer(ssi.waitFailure.getFuture())); + + // compactCache actor will periodically compact the cache when a certain version condition is met + actors.add(compactCache(&self)); + + // pullAsyncData actor pulls mutations from the TLog and also applies them. + actors.add(pullAsyncData(&self)); + + loop { + ++self.counters.loops; + choose { + when( wait( dbInfoChange ) ) { + dbInfoChange = db->onChange(); + self.logSystem->set(ILogSystem::fromServerDBInfo( ssi.id(), db->get(), true )); + } + when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) { + // TODO do we need to add throttling for cache servers?
Probably not + //actors.add(self->readGuard(req , getValueQ)); + actors.add(getValueQ(&self, req)); + } + when( WatchValueRequest req = waitNext(ssi.watchValue.getFuture()) ) { + ASSERT(false); + } + when (GetKeyRequest req = waitNext(ssi.getKey.getFuture())) { + actors.add(getKey(&self, req)); + } + when (GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture()) ) { + actors.add(getKeyValues(&self, req)); + } + when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) { + ASSERT(false); + } + when (StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) { + ASSERT(false); + } + //when( ReplyPromise reply = waitNext(ssi.getVersion.getFuture()) ) { + // ASSERT(false); + //} + when( ReplyPromise reply = waitNext(ssi.getKeyValueStoreType.getFuture()) ) { + ASSERT(false); + } + when(wait(actors.getResult())) {} + } + } +} diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 1d6e2b1d51..d794633905 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -344,6 +344,7 @@ struct TLogData : NonCopyable { std::map toBePopped; // map of Tag->Version for all the pops // that came when ignorePopRequest was set Reference> degraded; + std::vector tempTagMessages; TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference> dbInfo, Reference> degraded, std::string folder) : dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()), @@ -958,6 +959,81 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD return Void(); } +ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { + if (self->ignorePopRequest) { + TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); + + if (self->toBePopped.find(inputTag) == self->toBePopped.end() + || to > self->toBePopped[inputTag]) { + self->toBePopped[inputTag] = to; + } + // add the pop to the toBePopped map + TraceEvent(SevDebug, "IgnoringPopRequest") + .detail("IgnorePopDeadline", self->ignorePopDeadline) + .detail("Tag", inputTag.toString()) + .detail("Version", to); + return Void(); + } + state Version upTo = to; + int8_t tagLocality = inputTag.locality; + if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) { + upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to); + tagLocality = tagLocalityLogRouter; + } + state Tag tag(tagLocality, inputTag.id); + auto tagData = logData->getTagData(tag); + if (!tagData) { + tagData = logData->createTagData(tag, upTo, true, true, false); + } else if (upTo > tagData->popped) { + tagData->popped = upTo; + tagData->poppedRecently = true; + tagData->requiresPoppedLocationUpdate = true; + + if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) { + tagData->unpoppedRecovered = false; + logData->unpoppedRecoveredTags--; + TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt); + if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) { + logData->recoveryComplete.send(Void()); + } + } + + if (upTo > logData->persistentDataDurableVersion) + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); + //TraceEvent("TLogPop", 
self->dbgid).detail("Tag", tag.toString()).detail("To", upTo);
+    }
+    return Void();
+}
+
+ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
+    // timeout check for ignorePopRequest
+    if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) {
+
+        TraceEvent("EnableTLogPlayAllIgnoredPops");
+        // use toBePopped and issue all the pops
+        std::map<Tag, Version>::iterator it;
+        vector<Future<Void>> ignoredPops;
+        self->ignorePopRequest = false;
+        self->ignorePopUid = "";
+        self->ignorePopDeadline = 0.0;
+        for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
+            TraceEvent("PlayIgnoredPop")
+                .detail("Tag", it->first.toString())
+                .detail("Version", it->second);
+            ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData));
+        }
+        self->toBePopped.clear();
+        wait(waitForAll(ignoredPops));
+        TraceEvent("ResetIgnorePopRequest")
+            .detail("Now", g_network->now())
+            .detail("IgnorePopRequest", self->ignorePopRequest)
+            .detail("IgnorePopDeadline", self->ignorePopDeadline);
+    }
+    wait(tLogPopCore(self, req.tag, req.to, logData));
+    req.reply.send(Void());
+    return Void();
+}
+
 // This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all CPU resources.
 // For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important
 // work (e.g. commits).
@@ -977,6 +1053,26 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
 	state FlowLock::Releaser commitLockReleaser;
+	//FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries.
+	// It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag, which is not intended to ever happen.
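+	// To make the policy concrete (illustrative numbers only; the real values come from SERVER_KNOBS):
+	// with MAX_VERSIONS_IN_FLIGHT = 100e6 and MAX_CACHE_VERSIONS = 10e6, a log at version.get() = 200e6
+	// whose unrecoveredBefore = 50e6 has an unrecovered span of 150e6 > 100e6 + 10e6, so cachePopVersion
+	// becomes 200e6 - 10e6 = 190e6 and the cache tag is popped up to that version, bounding the cache
+	// tag's backlog to MAX_CACHE_VERSIONS.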
+ Optional cachePopVersion; + for(auto& it : self->id_data) { + if(!it.second->stopped) { + if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) { + cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS; + } + break; + } + } + + if(cachePopVersion.present()) { + state std::vector> cachePopFutures; + for(auto& it : self->id_data) { + cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second)); + } + wait( waitForAll(cachePopFutures) ); + } + if(logData->stopped) { if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) { while(logData->persistentDataDurableVersion != logData->version.get()) { @@ -1178,13 +1274,13 @@ void commitMessages( TLogData* self, Reference logData, Version version void commitMessages( TLogData *self, Reference logData, Version version, Arena arena, StringRef messages ) { ArenaReader rd( arena, messages, Unversioned() ); - std::vector msgs; + self->tempTagMessages.clear(); while(!rd.empty()) { TagsAndMessage tagsAndMsg; tagsAndMsg.loadFromArena(&rd, nullptr); - msgs.push_back(std::move(tagsAndMsg)); + self->tempTagMessages.push_back(std::move(tagsAndMsg)); } - commitMessages(self, logData, version, msgs); + commitMessages(self, logData, version, self->tempTagMessages); } Version poppedVersion( Reference self, Tag tag) { @@ -1207,81 +1303,6 @@ std::deque> & getVersionMessages( Re return tagData->versionMessages; }; -ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { - if (self->ignorePopRequest) { - TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); - - if (self->toBePopped.find(inputTag) == self->toBePopped.end() - || to > self->toBePopped[inputTag]) { - self->toBePopped[inputTag] = to; - } - // add the pop to the toBePopped map - TraceEvent(SevDebug, "IgnoringPopRequest") - .detail("IgnorePopDeadline", self->ignorePopDeadline) - .detail("Tag", inputTag.toString()) - .detail("Version", to); - return Void(); - } - state Version upTo = to; - int8_t tagLocality = inputTag.locality; - if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) { - upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to); - tagLocality = tagLocalityLogRouter; - } - state Tag tag(tagLocality, inputTag.id); - auto tagData = logData->getTagData(tag); - if (!tagData) { - tagData = logData->createTagData(tag, upTo, true, true, false); - } else if (upTo > tagData->popped) { - tagData->popped = upTo; - tagData->poppedRecently = true; - tagData->requiresPoppedLocationUpdate = true; - - if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) { - tagData->unpoppedRecovered = false; - logData->unpoppedRecoveredTags--; - TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt); - if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) { - logData->recoveryComplete.send(Void()); - } - } - - if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); - //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); - } - return Void(); -} - -ACTOR 
Future tLogPop( TLogData* self, TLogPopRequest req, Reference logData ) { - // timeout check for ignorePopRequest - if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) { - - TraceEvent("EnableTLogPlayAllIgnoredPops"); - // use toBePopped and issue all the pops - std::map::iterator it; - vector> ignoredPops; - self->ignorePopRequest = false; - self->ignorePopUid = ""; - self->ignorePopDeadline = 0.0; - for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) { - TraceEvent("PlayIgnoredPop") - .detail("Tag", it->first.toString()) - .detail("Version", it->second); - ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData)); - } - self->toBePopped.clear(); - wait(waitForAll(ignoredPops)); - TraceEvent("ResetIgnorePopRequest") - .detail("Now", g_network->now()) - .detail("IgnorePopRequest", self->ignorePopRequest) - .detail("IgnorePopDeadline", self->ignorePopDeadline); - } - wait(tLogPopCore(self, req.tag, req.to, logData)); - req.reply.send(Void()); - return Void(); -} - void peekMessagesFromMemory( Reference self, TLogPeekRequest const& req, BinaryWriter& messages, Version& endVersion ) { ASSERT( !messages.getLength() ); @@ -1340,6 +1361,9 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere try { peekId = req.sequence.get().first; sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw timed_out(); + } auto& trackerData = logData->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); @@ -1630,6 +1654,7 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData, st self->queueCommitBegin = commitNumber; logData->queueCommittingVersion = ver; + g_network->setCurrentTask(TaskPriority::TLogCommitReply); Future c = self->persistentQueue->commit(); self->diskQueueCommitBytes = 0; self->largeDiskQueueCommitBytes.set(false); @@ -2128,8 +2153,7 @@ void removeLog( TLogData* self, Reference logData ) { } } -// copy data from old gene to new gene without desiarlzing -ACTOR Future pullAsyncData( TLogData* self, Reference logData, std::vector tags, Version beginVersion, Optional endVersion, bool poppedIsKnownCommitted, bool parallelGetMore ) { +ACTOR Future pullAsyncData( TLogData* self, Reference logData, std::vector tags, Version beginVersion, Optional endVersion, bool poppedIsKnownCommitted ) { state Future dbInfoChange = Void(); state Reference r; state Version tagAt = beginVersion; @@ -2147,7 +2171,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st } when( wait( dbInfoChange ) ) { if( logData->logSystem->get() ) { - r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, parallelGetMore ); + r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, true ); } else { r = Reference(); } @@ -2284,7 +2308,7 @@ ACTOR Future tLogCore( TLogData* self, Reference logData, TLogInt if(!logData->isPrimary) { std::vector tags; tags.push_back(logData->remoteTag); - logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional(), true, true) ); + logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? 
logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional(), true) ); } try { @@ -2679,10 +2703,10 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit logData->logRouterPopToVersion = req.recoverAt; std::vector tags; tags.push_back(logData->remoteTag); - wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true, false) || logData->removed); + wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true) || logData->removed); } else if(!req.recoverTags.empty()) { ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion); - wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false, true) || logData->removed); + wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false) || logData->removed); } pulledRecoveryVersions = true; logData->knownCommittedVersion = req.recoverAt; @@ -2783,6 +2807,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ self.sharedActors.send( commitQueue(&self) ); self.sharedActors.send( updateStorageLoop(&self) ); + state Future activeSharedChange = Void(); loop { choose { @@ -2795,7 +2820,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } when ( wait( error ) ) { throw internal_error(); } - when ( wait( activeSharedTLog->onChange() ) ) { + when ( wait( activeSharedChange ) ) { if (activeSharedTLog->get() == tlogId) { TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get()); self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; @@ -2804,6 +2829,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get()); self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); } + activeSharedChange = activeSharedTLog->onChange(); } } } diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 37f40b15c3..68b3e21b24 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -459,7 +459,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedisLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || - tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { + tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) { lastBegin = std::max(lastBegin, log->startVersion); localSets.push_back(log); if(log->locality != tagLocalitySatellite) { @@ -486,7 +486,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedisLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || - tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { + tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != 
tagLocalitySatellite))) { thisBegin = std::max(thisBegin, log->startVersion); localOldSets.push_back(log); if(log->locality != tagLocalitySatellite) { @@ -538,7 +538,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted peekRemote( UID dbgid, Version begin, Tag tag, bool parallelGetMore ) { + Reference peekRemote( UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore ) { int bestSet = -1; Version lastBegin = recoveredAt.present() ? recoveredAt.get() + 1 : 0; for(int t = 0; t < tLogs.size(); t++) { @@ -552,22 +552,22 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, false ) ); + TraceEvent("TLogPeekRemoteNoBestSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()); + return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } if(begin >= lastBegin) { - TraceEvent("TLogPeekRemoteBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); - return Reference( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, begin, getPeekEnd(), false, std::vector(), Reference(), 0 ) ); + TraceEvent("TLogPeekRemoteBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); + return Reference( new ILogSystem::BufferedCursor( tLogs[bestSet]->logRouters, tag, begin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore ) ); } else { std::vector< Reference > cursors; std::vector< LogMessageVersion > epochEnds; - TraceEvent("TLogPeekRemoteAddingBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); - cursors.emplace_back(new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, lastBegin, getPeekEnd(), false, std::vector(), Reference(), 0 ) ); + TraceEvent("TLogPeekRemoteAddingBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); + cursors.emplace_back(new ILogSystem::BufferedCursor( tLogs[bestSet]->logRouters, tag, lastBegin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore ) ); int i = 0; while(begin < lastBegin) { if(i == oldLogData.size()) { - TraceEvent("TLogPeekRemoteDead", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LastBegin", lastBegin).detail("OldLogDataSize", oldLogData.size()); - return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, false ) ); + TraceEvent("TLogPeekRemoteDead", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? 
end.get() : getPeekEnd()).detail("LastBegin", lastBegin).detail("OldLogDataSize", oldLogData.size()); + return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } int bestOldSet = -1; @@ -583,15 +583,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, false ) ); + TraceEvent("TLogPeekRemoteNoOldBestSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()); + return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } if(thisBegin < lastBegin) { - TraceEvent("TLogPeekRemoteAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestOldSet", bestOldSet).detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString()) + TraceEvent("TLogPeekRemoteAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("BestOldSet", bestOldSet).detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString()) .detail("LastBegin", lastBegin).detail("ThisBegin", thisBegin).detail("BestStartVer", oldLogData[i].tLogs[bestOldSet]->startVersion); - cursors.emplace_back(new ILogSystem::MergedPeekCursor(oldLogData[i].tLogs[bestOldSet]->logRouters, -1, (int)oldLogData[i].tLogs[bestOldSet]->logRouters.size(), tag, - thisBegin, lastBegin, false, std::vector(), Reference(), 0)); + cursors.emplace_back(new ILogSystem::BufferedCursor(oldLogData[i].tLogs[bestOldSet]->logRouters, tag, thisBegin, lastBegin, parallelGetMore)); epochEnds.emplace_back(lastBegin); lastBegin = thisBegin; } @@ -602,14 +601,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted peek( UID dbgid, Version begin, Tag tag, bool parallelGetMore ) { + virtual Reference peek( UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore ) { if(!tLogs.size()) { TraceEvent("TLogPeekNoLogSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, false ) ); } if(tag.locality == tagLocalityRemoteLog) { - return peekRemote(dbgid, begin, tag, parallelGetMore); + return peekRemote(dbgid, begin, end, tag, parallelGetMore); } else { return peekAll(dbgid, begin, getPeekEnd(), tag, parallelGetMore); } @@ -622,12 +621,12 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted > cursors; for(auto tag : tags) { - cursors.push_back(peek(dbgid, begin, tag, parallelGetMore)); + cursors.push_back(peek(dbgid, begin, end, tag, parallelGetMore)); } return Reference( new ILogSystem::BufferedCursor(cursors, begin, end.present() ? 
end.get() + 1 : getPeekEnd(), true, tLogs[0]->locality == tagLocalityUpgraded, false) ); } @@ -1033,7 +1032,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted popFromLog( TagPartitionedLogSystem* self, Reference>> log, Tag tag, double time ) { state Version last = 0; loop { - wait( delay(time) ); + wait( delay(time, TaskPriority::TLogPop) ); state std::pair to = self->outstandingPops[ std::make_pair(log->get().id(),tag) ]; @@ -1045,7 +1044,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedget().present() ) return Void(); - wait(log->get().interf().popMessages.getReply( TLogPopRequest( to.first, to.second, tag ) ) ); + wait(log->get().interf().popMessages.getReply( TLogPopRequest( to.first, to.second, tag ), TaskPriority::TLogPop ) ); last = to.first; } catch (Error& e) { @@ -1270,7 +1269,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted::max(); } - virtual void getPushLocations(std::vector const& tags, std::vector& locations, bool allLocations) { + virtual void getPushLocations(VectorRef tags, std::vector& locations, bool allLocations) { int locationOffset = 0; for(auto& log : tLogs) { if(log->isLocal && log->logServers.size()) { @@ -1907,7 +1906,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted locations; for( Tag tag : localTags ) { locations.clear(); - logSet->getPushLocations( vector(1, tag), locations, 0 ); + logSet->getPushLocations( VectorRef(&tag, 1), locations, 0 ); for(int loc : locations) remoteTLogReqs[ loc ].recoverTags.push_back( tag ); } @@ -1923,7 +1922,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtxsTags); locations.clear(); - logSet->getPushLocations( {pushTag}, locations, 0 ); + logSet->getPushLocations( VectorRef(&pushTag, 1), locations, 0 ); for(int loc : locations) remoteTLogReqs[ loc ].recoverTags.push_back( tag ); } @@ -2117,7 +2116,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted locations; for( Tag tag : localTags ) { locations.clear(); - logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); + logSystem->tLogs[0]->getPushLocations( VectorRef(&tag, 1), locations, 0 ); for(int loc : locations) reqs[ loc ].recoverTags.push_back( tag ); } @@ -2131,7 +2130,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtxsTags); locations.clear(); - logSystem->tLogs[0]->getPushLocations( vector(1, pushTag), locations, 0 ); + logSystem->tLogs[0]->getPushLocations( VectorRef(&pushTag, 1), locations, 0 ); for(int loc : locations) reqs[ loc ].recoverTags.push_back( tag ); } @@ -2183,7 +2182,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogRouterTags); locations.clear(); - logSystem->tLogs[1]->getPushLocations( {pushLocation}, locations, 0 ); + logSystem->tLogs[1]->getPushLocations( VectorRef(&pushLocation,1), locations, 0 ); for(int loc : locations) sreqs[ loc ].recoverTags.push_back( tag ); } @@ -2193,7 +2192,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtxsTags); locations.clear(); - logSystem->tLogs[1]->getPushLocations( {pushTag}, locations, 0 ); + logSystem->tLogs[1]->getPushLocations( VectorRef(&pushTag,1), locations, 0 ); for(int loc : locations) sreqs[ loc ].recoverTags.push_back( tag ); } diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 376ea8bdf0..b4facd88f2 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -25,8 +25,10 @@ #include "flow/serialize.h" #include "flow/genericactors.actor.h" #include "flow/UnitTest.h" 
-#include "fdbserver/MemoryPager.h" -#include "fdbserver/IndirectShadowPager.h" +#include "fdbserver/IPager.h" +#include "fdbrpc/IAsyncFile.h" +#include "fdbrpc/crc32c.h" +#include "flow/ActorCollection.h" #include #include #include "fdbclient/CommitTransaction.h" @@ -35,6 +37,1744 @@ #include #include "flow/actorcompiler.h" #include +#include + +// Some convenience functions for debugging to stringify various structures +// Classes can add compatibility by either specializing toString or implementing +// std::string toString() const; +template +std::string toString(const T &o) { + return o.toString(); +} + +std::string toString(StringRef s) { + return s.printable(); +} + +std::string toString(LogicalPageID id) { + if(id == invalidLogicalPageID) { + return "LogicalPageID{invalid}"; + } + return format("LogicalPageID{%" PRId64 "}", id); +} + +template +std::string toString(const Standalone &s) { + return toString((T)s); +} + +template +std::string toString(const T *begin, const T *end) { + std::string r = "{"; + + bool comma = false; + while(begin != end) { + if(comma) { + r += ", "; + } + else { + comma = true; + } + r += toString(*begin++); + } + + r += "}"; + return r; +} + +template +std::string toString(const std::vector &v) { + return toString(v.begin(), v.end()); +} + +template +std::string toString(const VectorRef &v) { + return toString(v.begin(), v.end()); +} + +template +std::string toString(const Optional &o) { + if(o.present()) { + return toString(o.get()); + } + return ""; +} + +// A FIFO queue of T stored as a linked list of pages. +// Main operations are pop(), pushBack(), pushFront(), and flush(). +// +// flush() will ensure all queue pages are written to the pager and move the unflushed +// pushFront()'d records onto the front of the queue, in FIFO order. +// +// pop() will only return records that have been flushed, and pops +// from the front of the queue. +// +// Each page contains some number of T items and a link to the next page and starting position on that page. +// When the queue is flushed, the last page in the chain is ended and linked to a newly allocated +// but not-yet-written-to pageID, which future writes after the flush will write to. +// Items pushed onto the front of the queue are written to a separate linked list until flushed, +// at which point that list becomes the new front of the queue. +// +// The write pattern is designed such that no page is ever expected to be valid after +// being written to or updated but not fsync'd. This is why a new unused page is added +// to the queue, linked to by the last data page, before commit. The new page can't be +// added and filled with data as part of the next commit because that would mean modifying +// the previous tail page to update its next link, which risks corrupting it and losing +// data that was not yet popped if that write is never fsync'd. 
+// Requirements on T
+//   - must be trivially copyable
+//     OR have a specialization for FIFOQueueCodec<T>
+//     OR have the following methods
+//       // Deserialize from src into *this, return number of bytes from src consumed
+//       int readFromBytes(const uint8_t *src);
+//       // Return the size of *this serialized
+//       int bytesNeeded() const;
+//       // Serialize *this to dst, return number of bytes written to dst
+//       int writeToBytes(uint8_t *dst) const;
+//   - must be supported by toString(object) (see above)
+template <typename T, typename Enable = void>
+struct FIFOQueueCodec {
+    static T readFromBytes(const uint8_t *src, int &bytesRead) {
+        T x;
+        bytesRead = x.readFromBytes(src);
+        return x;
+    }
+    static int bytesNeeded(const T &x) {
+        return x.bytesNeeded();
+    }
+    static int writeToBytes(uint8_t *dst, const T &x) {
+        return x.writeToBytes(dst);
+    }
+};
+
+template <typename T>
+struct FIFOQueueCodec<T, typename std::enable_if<std::is_trivially_copyable<T>::value>::type> {
+    static_assert(std::is_trivially_copyable<T>::value);
+    static T readFromBytes(const uint8_t *src, int &bytesRead) {
+        bytesRead = sizeof(T);
+        return *(T *)src;
+    }
+    static int bytesNeeded(const T &x) {
+        return sizeof(T);
+    }
+    static int writeToBytes(uint8_t *dst, const T &x) {
+        *(T *)dst = x;
+        return sizeof(T);
+    }
+};
+
+template <typename T, typename Codec = FIFOQueueCodec<T>>
+class FIFOQueue {
+public:
+#pragma pack(push, 1)
+    struct QueueState {
+        bool operator==(const QueueState &rhs) const {
+            return memcmp(this, &rhs, sizeof(QueueState)) == 0;
+        }
+        LogicalPageID headPageID = invalidLogicalPageID;
+        LogicalPageID tailPageID = invalidLogicalPageID;
+        uint16_t headOffset;
+        // Note that there is no tail offset because the tail page is always never-before-written and writes to it start at offset 0
+        int64_t numPages;
+        int64_t numEntries;
+        std::string toString() const {
+            return format("{head: %s:%d tail: %s numPages: %" PRId64 " numEntries: %" PRId64 "}", ::toString(headPageID).c_str(), (int)headOffset, ::toString(tailPageID).c_str(), numPages, numEntries);
+        }
+    };
+#pragma pack(pop)
+
+    struct Cursor {
+        enum Mode {
+            NONE,
+            POP,
+            READONLY,
+            WRITE
+        };
+
+        // The current page being read or written to
+        LogicalPageID pageID;
+
+        // The first page ID to be written to the pager, if this cursor has written anything
+        LogicalPageID firstPageIDWritten;
+
+        // Offset after the RawPage header to next read from or write to
+        int offset;
+
+        // A read cursor will not read this page (or beyond)
+        LogicalPageID endPageID;
+
+        Reference<IPage> page;
+        FIFOQueue *queue;
+        Future<Void> operation;
+        Mode mode;
+
+        Cursor() : mode(NONE) {
+        }
+
+        // Initialize a cursor.
+        void init(FIFOQueue *q = nullptr, Mode m = NONE, LogicalPageID initialPageID = invalidLogicalPageID, int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) {
+            if(operation.isValid()) {
+                operation.cancel();
+            }
+            queue = q;
+            mode = m;
+            firstPageIDWritten = invalidLogicalPageID;
+            offset = readOffset;
+            endPageID = endPage;
+            page.clear();
+
+            if(mode == POP || mode == READONLY) {
+                // If the cursor is not pointed at the end page then start loading it.
+                // The end page will not have been written to disk yet.
+                pageID = initialPageID;
+                operation = (pageID == endPageID) ? Void() : loadPage();
+            }
+            else {
+                pageID = invalidLogicalPageID;
+                ASSERT(mode == WRITE || (initialPageID == invalidLogicalPageID && readOffset == 0 && endPage == invalidLogicalPageID));
+                operation = Void();
+            }
+
+            debug_printf("FIFOQueue::Cursor(%s) initialized\n", toString().c_str());
+
+            if(mode == WRITE && initialPageID != invalidLogicalPageID) {
+                addNewPage(initialPageID, 0, true);
+            }
+        }
+
+        // Since cursors can have async operations pending which modify their state they can't be copied cleanly
+        Cursor(const Cursor &other) = delete;
+
+        // A read cursor can be initialized from a pop cursor
+        void initReadOnly(const Cursor &c) {
+            ASSERT(c.mode == READONLY || c.mode == POP);
+            init(c.queue, READONLY, c.pageID, c.offset, c.endPageID);
+        }
+
+        ~Cursor() {
+            operation.cancel();
+        }
+
+        std::string toString() const {
+            if(mode == WRITE) {
+                return format("{WriteCursor %s:%p pos=%s:%d endOffset=%d}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1);
+            }
+            if(mode == POP || mode == READONLY) {
+                return format("{ReadCursor %s:%p pos=%s:%d endOffset=%d endPage=%s}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1, ::toString(endPageID).c_str());
+            }
+            ASSERT(mode == NONE);
+            return format("{NullCursor=%p}", this);
+        }
+
+#pragma pack(push, 1)
+        struct RawPage {
+            LogicalPageID nextPageID;
+            uint16_t nextOffset;
+            uint16_t endOffset;
+            uint8_t * begin() {
+                return (uint8_t *)(this + 1);
+            }
+        };
+#pragma pack(pop)
+
+        Future<Void> notBusy() {
+            return operation;
+        }
+
+        // Returns true if any items have been written to the last page
+        bool pendingWrites() const {
+            return mode == WRITE && offset != 0;
+        }
+
+        RawPage * raw() const {
+            return ((RawPage *)(page->begin()));
+        }
+
+        void setNext(LogicalPageID pageID, int offset) {
+            ASSERT(mode == WRITE);
+            RawPage *p = raw();
+            p->nextPageID = pageID;
+            p->nextOffset = offset;
+        }
+
+        Future<Void> loadPage() {
+            ASSERT(mode == POP || mode == READONLY);
+            debug_printf("FIFOQueue::Cursor(%s) loadPage\n", toString().c_str());
+            return map(queue->pager->readPage(pageID, true), [=](Reference<IPage> p) {
+                page = p;
+                debug_printf("FIFOQueue::Cursor(%s) loadPage done\n", toString().c_str());
+                return Void();
+            });
+        }
+
+        void writePage() {
+            ASSERT(mode == WRITE);
+            debug_printf("FIFOQueue::Cursor(%s) writePage\n", toString().c_str());
+            VALGRIND_MAKE_MEM_DEFINED(raw()->begin(), offset);
+            VALGRIND_MAKE_MEM_DEFINED(raw()->begin() + offset, queue->dataBytesPerPage - raw()->endOffset);
+            queue->pager->updatePage(pageID, page);
+            if(firstPageIDWritten == invalidLogicalPageID) {
+                firstPageIDWritten = pageID;
+            }
+        }
+
+        // Link the current page to newPageID:newOffset and then write it to the pager.
+        // If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized
+        // as a new tail page.
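+        // For illustration, with P1 as the current tail page and P2 a freshly allocated page,
+        // addNewPage(P2, 0, true) does the following:
+        //
+        //   before:  [P1 | next: (unset) | items...]
+        //   link:    [P1 | next: P2:0    | items...]   <- P1 is written to the pager here
+        //   after:   [P2 | endOffset: 0  | (empty)  ]  <- P2 becomes the in-memory write target
+        //
+        // P2 itself is not written until it, in turn, is linked forward, which happens on the
+        // next flush at the latest.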
+ void addNewPage(LogicalPageID newPageID, int newOffset, bool initializeNewPage) { + ASSERT(mode == WRITE); + ASSERT(newPageID != invalidLogicalPageID); + debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d\n", toString().c_str(), ::toString(newPageID).c_str(), initializeNewPage); + + // Update existing page and write, if it exists + if(page) { + setNext(newPageID, newOffset); + debug_printf("FIFOQueue::Cursor(%s) Linked new page\n", toString().c_str()); + writePage(); + } + + pageID = newPageID; + offset = newOffset; + + if(initializeNewPage) { + debug_printf("FIFOQueue::Cursor(%s) Initializing new page\n", toString().c_str()); + page = queue->pager->newPageBuffer(); + setNext(0, 0); + auto p = raw(); + ASSERT(newOffset == 0); + p->endOffset = 0; + } + else { + page.clear(); + } + } + + // Write item to the next position in the current page or, if it won't fit, add a new page and write it there. + ACTOR static Future write_impl(Cursor *self, T item, Future start) { + ASSERT(self->mode == WRITE); + + // Wait for the previous operation to finish + state Future previous = self->operation; + wait(start); + wait(previous); + + state int bytesNeeded = Codec::bytesNeeded(item); + if(self->pageID == invalidLogicalPageID || self->offset + bytesNeeded > self->queue->dataBytesPerPage) { + debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", self->toString().c_str(), ::toString(item).c_str()); + LogicalPageID newPageID = wait(self->queue->pager->newPageID()); + self->addNewPage(newPageID, 0, true); + ++self->queue->numPages; + wait(yield()); + } + debug_printf("FIFOQueue::Cursor(%s) before write(%s)\n", self->toString().c_str(), ::toString(item).c_str()); + auto p = self->raw(); + Codec::writeToBytes(p->begin() + self->offset, item); + self->offset += bytesNeeded; + p->endOffset = self->offset; + ++self->queue->numEntries; + return Void(); + } + + void write(const T &item) { + Promise p; + operation = write_impl(this, item, p.getFuture()); + p.send(Void()); + } + + // Read the next item at the cursor (if <= upperBound), moving to a new page first if the current page is exhausted + ACTOR static Future> readNext_impl(Cursor *self, Optional upperBound, Future start) { + ASSERT(self->mode == POP || self->mode == READONLY); + + // Wait for the previous operation to finish + state Future previous = self->operation; + wait(start); + wait(previous); + + debug_printf("FIFOQueue::Cursor(%s) readNext begin\n", self->toString().c_str()); + if(self->pageID == invalidLogicalPageID || self->pageID == self->endPageID) { + debug_printf("FIFOQueue::Cursor(%s) readNext returning nothing\n", self->toString().c_str()); + return Optional(); + } + + // We now know we are pointing to PageID and it should be read and used, but it may not be loaded yet. 
+            if(!self->page) {
+                wait(self->loadPage());
+                wait(yield());
+            }
+
+            auto p = self->raw();
+            debug_printf("FIFOQueue::Cursor(%s) readNext reading at current position\n", self->toString().c_str());
+            ASSERT(self->offset < p->endOffset);
+            int bytesRead;
+            T result = Codec::readFromBytes(p->begin() + self->offset, bytesRead);
+
+            if(upperBound.present() && upperBound.get() < result) {
+                debug_printf("FIFOQueue::Cursor(%s) not popping %s, exceeds upper bound %s\n",
+                    self->toString().c_str(), ::toString(result).c_str(), ::toString(upperBound.get()).c_str());
+                return Optional<T>();
+            }
+
+            self->offset += bytesRead;
+            if(self->mode == POP) {
+                --self->queue->numEntries;
+            }
+            debug_printf("FIFOQueue::Cursor(%s) after read of %s\n", self->toString().c_str(), ::toString(result).c_str());
+            ASSERT(self->offset <= p->endOffset);
+
+            if(self->offset == p->endOffset) {
+                debug_printf("FIFOQueue::Cursor(%s) Page exhausted\n", self->toString().c_str());
+                LogicalPageID oldPageID = self->pageID;
+                self->pageID = p->nextPageID;
+                self->offset = p->nextOffset;
+                if(self->mode == POP) {
+                    --self->queue->numPages;
+                }
+                self->page.clear();
+                debug_printf("FIFOQueue::Cursor(%s) readNext page exhausted, moved to new page\n", self->toString().c_str());
+
+                if(self->mode == POP) {
+                    // Freeing the old page must happen after advancing the cursor and clearing the page reference because
+                    // freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this
+                    // very same queue.
+                    // Queue pages are freed at version 0 because they can be reused after the next commit.
+                    self->queue->pager->freePage(oldPageID, 0);
+                }
+            }
+
+            debug_printf("FIFOQueue(%s) %s(upperBound=%s) -> %s\n", self->queue->name.c_str(), (self->mode == POP ? "pop" : "peek"), ::toString(upperBound).c_str(), ::toString(result).c_str());
+            return result;
+        }
+
+        // Read and move past the next item if it is <= upperBound, or if upperBound is not present
+        Future<Optional<T>> readNext(const Optional<T> &upperBound = {}) {
+            if(mode == NONE) {
+                return Optional<T>();
+            }
+            Promise<Void> p;
+            Future<Optional<T>> read = readNext_impl(this, upperBound, p.getFuture());
+            operation = success(read);
+            p.send(Void());
+            return read;
+        }
+    };
+
+public:
+    FIFOQueue() : pager(nullptr) {
+    }
+
+    ~FIFOQueue() {
+        newTailPage.cancel();
+    }
+
+    FIFOQueue(const FIFOQueue &other) = delete;
+    void operator=(const FIFOQueue &rhs) = delete;
+
+    // Create a new queue at newPageID
+    void create(IPager2 *p, LogicalPageID newPageID, std::string queueName) {
+        debug_printf("FIFOQueue(%s) create from page %s\n", queueName.c_str(), toString(newPageID).c_str());
+        pager = p;
+        name = queueName;
+        numPages = 1;
+        numEntries = 0;
+        dataBytesPerPage = pager->getUsablePageSize() - sizeof(typename Cursor::RawPage);
+        headReader.init(this, Cursor::POP, newPageID, 0, newPageID);
+        tailWriter.init(this, Cursor::WRITE, newPageID);
+        headWriter.init(this, Cursor::WRITE);
+        newTailPage = invalidLogicalPageID;
+        debug_printf("FIFOQueue(%s) created\n", queueName.c_str());
+    }
+
+    // Load an existing queue from its queue state
+    void recover(IPager2 *p, const QueueState &qs, std::string queueName) {
+        debug_printf("FIFOQueue(%s) recover from queue state %s\n", queueName.c_str(), qs.toString().c_str());
+        pager = p;
+        name = queueName;
+        numPages = qs.numPages;
+        numEntries = qs.numEntries;
+        dataBytesPerPage = pager->getUsablePageSize() - sizeof(typename Cursor::RawPage);
+        headReader.init(this, Cursor::POP, qs.headPageID, qs.headOffset, qs.tailPageID);
+        tailWriter.init(this, Cursor::WRITE, qs.tailPageID);
+        headWriter.init(this, Cursor::WRITE);
+        newTailPage = invalidLogicalPageID;
+        debug_printf("FIFOQueue(%s) recovered\n", queueName.c_str());
+    }
+
+    ACTOR static Future<Standalone<VectorRef<T>>> peekAll_impl(FIFOQueue *self) {
+        state Standalone<VectorRef<T>> results;
+        state Cursor c;
+        c.initReadOnly(self->headReader);
+        results.reserve(results.arena(), self->numEntries);
+
+        loop {
+            Optional<T> x = wait(c.readNext());
+            if(!x.present()) {
+                break;
+            }
+            results.push_back(results.arena(), x.get());
+        }
+
+        return results;
+    }
+
+    Future<Standalone<VectorRef<T>>> peekAll() {
+        return peekAll_impl(this);
+    }
+
+    // Pop the next item off the front of the queue if it is <= upperBound, or if upperBound is not present
+    Future<Optional<T>> pop(Optional<T> upperBound = {}) {
+        return headReader.readNext(upperBound);
+    }
+
+    QueueState getState() const {
+        QueueState s;
+        s.headOffset = headReader.offset;
+        s.headPageID = headReader.pageID;
+        s.tailPageID = tailWriter.pageID;
+        s.numEntries = numEntries;
+        s.numPages = numPages;
+
+        debug_printf("FIFOQueue(%s) getState(): %s\n", name.c_str(), s.toString().c_str());
+        return s;
+    }
+
+    void pushBack(const T &item) {
+        debug_printf("FIFOQueue(%s) pushBack(%s)\n", name.c_str(), toString(item).c_str());
+        tailWriter.write(item);
+    }
+
+    void pushFront(const T &item) {
+        debug_printf("FIFOQueue(%s) pushFront(%s)\n", name.c_str(), toString(item).c_str());
+        headWriter.write(item);
+    }
+
+    // Wait until the most recently started operations on each cursor as of now are ready
+    Future<Void> notBusy() {
+        return headWriter.notBusy() && headReader.notBusy() && tailWriter.notBusy() && ready(newTailPage);
+    }
+
+    // Returns true if any of the most recently started operations on any cursor are not ready
+    bool busy() {
+        return !headWriter.notBusy().isReady() || !headReader.notBusy().isReady() || !tailWriter.notBusy().isReady() || !newTailPage.isReady();
+    }
+
+    // preFlush() prepares this queue to be flushed to disk, but doesn't actually do it so the queue can still
+    // be pushed and popped after this operation. It returns whether or not any operations were pending or
+    // started during execution.
+    //
+    // If one or more queues are used by their pager in newPageID() or freePage() operations, then preFlush()
+    // must be called on each of them inside a loop that runs until each of the preFlush() calls has returned
+    // false.
+    //
+    // The reason for all this is that:
+    //   - queue pop() can call pager->freePage() which can call push() on the same or another queue
+    //   - queue push() can call pager->newPageID() which can call pop() on the same or another queue
+    // This creates a circular dependency with 1 or more queues when those queues are used by the pager
+    // to manage free page IDs.
+    ACTOR static Future<bool> preFlush_impl(FIFOQueue *self) {
+        debug_printf("FIFOQueue(%s) preFlush begin\n", self->name.c_str());
+        wait(self->notBusy());
+
+        // Completion of the pending operations as of the start of notBusy() could have begun new operations,
+        // so see if any work is pending now.
+        bool workPending = self->busy();
+
+        if(!workPending) {
+            // A newly created or flushed queue starts out in a state where its tail page to be written to is empty.
+            // After pushBack() is called, this is no longer the case and never will be again until the queue is flushed.
+            // Before the non-empty tail page is written it must be linked to a new empty page for use after the next
+            // flush.  (This is explained more at the top of FIFOQueue; it is because queue pages can only be written
+            // once: after a page contains durable data, a second write to link it to a new page could corrupt the
+            // existing data if the subsequent commit never succeeds.)
+            if(self->newTailPage.isReady() && self->newTailPage.get() == invalidLogicalPageID && self->tailWriter.pendingWrites()) {
+                self->newTailPage = self->pager->newPageID();
+                workPending = true;
+            }
+        }
+
+        debug_printf("FIFOQueue(%s) preFlush returning %d\n", self->name.c_str(), workPending);
+        return workPending;
+    }
+
+    Future<bool> preFlush() {
+        return preFlush_impl(this);
+    }
+
+    void finishFlush() {
+        debug_printf("FIFOQueue(%s) finishFlush start\n", name.c_str());
+        ASSERT(!busy());
+
+        // If a new tail page was allocated, link the last page of the tail writer to it.
+        if(newTailPage.get() != invalidLogicalPageID) {
+            tailWriter.addNewPage(newTailPage.get(), 0, false);
+            // The flush sequence allocated a page and added it to the queue so increment numPages
+            ++numPages;
+
+            // newPage() should be ready immediately since a pageID is being explicitly passed.
+            ASSERT(tailWriter.notBusy().isReady());
+
+            newTailPage = invalidLogicalPageID;
+        }
+
+        // If the headWriter wrote anything, link its tail page to the headReader position and point the headReader
+        // to the start of the headWriter
+        if(headWriter.pendingWrites()) {
+            headWriter.addNewPage(headReader.pageID, headReader.offset, false);
+            headReader.pageID = headWriter.firstPageIDWritten;
+            headReader.offset = 0;
+            headReader.page.clear();
+        }
+
+        // Update headReader's end page to the new tail page
+        headReader.endPageID = tailWriter.pageID;
+
+        // Reset the write cursors
+        tailWriter.init(this, Cursor::WRITE, tailWriter.pageID);
+        headWriter.init(this, Cursor::WRITE);
+
+        debug_printf("FIFOQueue(%s) finishFlush end\n", name.c_str());
+    }
+
+    ACTOR static Future<Void> flush_impl(FIFOQueue *self) {
+        loop {
+            bool notDone = wait(self->preFlush());
+            if(!notDone) {
+                break;
+            }
+        }
+        self->finishFlush();
+        return Void();
+    }
+
+    Future<Void> flush() {
+        return flush_impl(this);
+    }
+
+    IPager2 *pager;
+    int64_t numPages;
+    int64_t numEntries;
+    int dataBytesPerPage;
+
+    Cursor headReader;
+    Cursor tailWriter;
+    Cursor headWriter;
+
+    Future<LogicalPageID> newTailPage;
+
+    // For debugging
+    std::string name;
+};
+
+int nextPowerOf2(uint32_t x) {
+    return 1 << (32 - clz(x - 1));
+}
+
+class FastAllocatedPage : public IPage, public FastAllocated<FastAllocatedPage>, ReferenceCounted<FastAllocatedPage> {
+public:
+    // Create a fast-allocated page with size total bytes INCLUDING checksum
+    FastAllocatedPage(int size, int bufferSize) : logicalSize(size), bufferSize(bufferSize) {
+        buffer = (uint8_t *)allocateFast(bufferSize);
+        // Mark any unused page portion defined
+        VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize);
+    };
+
+    virtual ~FastAllocatedPage() {
+        freeFast(bufferSize, buffer);
+    }
+
+    // Usable size, without checksum
+    int size() const {
+        return logicalSize - sizeof(Checksum);
+    }
+
+    uint8_t const* begin() const {
+        return buffer;
+    }
+
+    uint8_t* mutate() {
+        return buffer;
+    }
+
+    void addref() const {
+        ReferenceCounted<FastAllocatedPage>::addref();
+    }
+
+    void delref() const {
+        ReferenceCounted<FastAllocatedPage>::delref();
+    }
+
+    typedef uint32_t Checksum;
+
+    Checksum & getChecksum() {
+        return *(Checksum *)(buffer + size());
+    }
+
+    Checksum calculateChecksum(LogicalPageID pageID) {
+        return crc32c_append(pageID, buffer, size());
+    }
+
+    void updateChecksum(LogicalPageID pageID) {
+        getChecksum() = calculateChecksum(pageID);
+    }
+
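+    // Layout recap of the scheme above: buffer holds size() bytes of usable page content,
+    // followed by a 4-byte crc32c in the final sizeof(Checksum) bytes of the logical page.
+    // Seeding the checksum with pageID means a page durably written to the wrong location
+    // fails verification even if its bytes are internally consistent.
+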
+    bool verifyChecksum(LogicalPageID pageID) {
+        return getChecksum() == calculateChecksum(pageID);
+    }
+private:
+    int logicalSize;
+    int bufferSize;
+    uint8_t *buffer;
+};
+
+// Holds an index of recently used objects.
+// ObjectType must have the methods
+//   bool evictable() const;            // return true if the entry can be evicted now
+//   Future<Void> onEvictable() const;  // ready when the entry can be evicted
+// indicating if/when it is safe to evict.
+template <class IndexType, class ObjectType>
+class ObjectCache : NonCopyable {
+
+    struct Entry : public boost::intrusive::list_base_hook<> {
+        Entry() : hits(0) {
+        }
+        IndexType index;
+        ObjectType item;
+        int hits;
+    };
+
+public:
+    ObjectCache(int sizeLimit = 0) : sizeLimit(sizeLimit), cacheHits(0), cacheMisses(0), noHitEvictions(0) {
+    }
+
+    void setSizeLimit(int n) {
+        sizeLimit = n;
+    }
+
+    // Get the object for i if it exists, else return nullptr.
+    // If the object exists, its eviction order will NOT change as this is not a cache hit.
+    ObjectType * getIfExists(const IndexType &index) {
+        auto i = cache.find(index);
+        if(i != cache.end()) {
+            ++i->second.hits;
+            return &i->second.item;
+        }
+        return nullptr;
+    }
+
+    // Get the object for i or create a new one.
+    // After a get(), the object for i is the last in evictionOrder.
+    ObjectType & get(const IndexType &index, bool noHit = false) {
+        Entry &entry = cache[index];
+
+        // If entry is linked into evictionOrder then move it to the back of the order
+        if(entry.is_linked()) {
+            if(!noHit) {
+                ++entry.hits;
+                ++cacheHits;
+            }
+            // Move the entry to the back of the eviction order
+            evictionOrder.erase(evictionOrder.iterator_to(entry));
+            evictionOrder.push_back(entry);
+        }
+        else {
+            ++cacheMisses;
+            // Finish initializing entry
+            entry.index = index;
+            entry.hits = noHit ? 0 : 1;
+            // Insert the newly created Entry at the back of the eviction order
+            evictionOrder.push_back(entry);
+
+            // If the cache is too big, try to evict the first Entry in the eviction order
+            if(cache.size() > sizeLimit) {
+                Entry &toEvict = evictionOrder.front();
+                debug_printf("Trying to evict %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str());
+                // Don't evict the entry that was just added as then we can't return a reference to it.
+                if(toEvict.index != index && toEvict.item.evictable()) {
+                    if(toEvict.hits == 0) {
+                        ++noHitEvictions;
+                    }
+                    debug_printf("Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str());
+                    evictionOrder.pop_front();
+                    cache.erase(toEvict.index);
+                }
+            }
+        }
+
+        return entry.item;
+    }
+
+    // Clears the cache, saving the entries, and then waits for each item to be evictable and evicts it.
+    // The cache should not be used again until the returned future is ready.
+    ACTOR static Future<Void> clear_impl(ObjectCache *self) {
+        state std::unordered_map<IndexType, Entry> cache;
+        state boost::intrusive::list<Entry> evictionOrder;
+
+        // Swap cache contents to local state vars
+        cache.swap(self->cache);
+        evictionOrder.swap(self->evictionOrder);
+
+        state typename boost::intrusive::list<Entry>::iterator i = evictionOrder.begin();
+        state typename boost::intrusive::list<Entry>::iterator iEnd = evictionOrder.end();
+
+        while(i != iEnd) {
+            if(!i->item.evictable()) {
+                wait(i->item.onEvictable());
+            }
+            ++i;
+        }
+
+        evictionOrder.clear();
+        cache.clear();
+
+        return Void();
+    }
+
+    Future<Void> clear() {
+        return clear_impl(this);
+    }
+
+    int count() const {
+        ASSERT(evictionOrder.size() == cache.size());
+        return evictionOrder.size();
+    }
+
+private:
+    int64_t sizeLimit;
+    int64_t cacheHits;
+    int64_t cacheMisses;
+    int64_t noHitEvictions;
+
+    // TODO: Use a boost intrusive unordered set instead, with a comparator that only considers entry.index
+    std::unordered_map<IndexType, Entry> cache;
+    boost::intrusive::list<Entry> evictionOrder;
+};
+
+ACTOR template <typename T> Future<T> forwardError(Future<T> f, Promise<Void> target) {
+    try {
+        T x = wait(f);
+        return x;
+    }
+    catch(Error &e) {
+        if(e.code() != error_code_actor_cancelled && target.canBeSet()) {
+            target.sendError(e);
+        }
+
+        throw e;
+    }
+}
+
+class DWALPagerSnapshot;
+
+// An implementation of IPager2 that supports atomicUpdate() of a page without forcing a change to a new page ID.
+// It does this by internally mapping the original page ID to alternate page IDs by write version.
+// The page ID remaps are kept in memory and also logged to a "remap queue" which must be reloaded on cold start.
+// To prevent the set of remaps from growing unboundedly, once a remap is old enough to be at or before the
+// oldest pager version being maintained the remap can be "undone" by popping it from the remap queue,
+// copying the alternate page ID's data over top of the original page ID's data, and deleting the remap from memory.
+// This process basically describes a "Delayed" Write-Ahead-Log (DWAL) because the remap queue and the newly allocated
+// alternate pages it references serve as a write-ahead log for pages that will eventually be copied
+// back to their original location once the original version is no longer needed.
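+//
+// A sketch of the lookup this implies, assuming the in-memory index used below
+// (std::map<LogicalPageID, std::map<Version, LogicalPageID>> remappedPages, populated in
+// recover()): a read of pageID at snapshot version v resolves to the newest remap at or
+// before v, else falls back to the original page:
+//
+//   LogicalPageID physical = pageID;
+//   auto i = remappedPages.find(pageID);
+//   if(i != remappedPages.end()) {
+//     auto j = i->second.upper_bound(v);   // first remap with version > v
+//     if(j != i->second.begin()) {
+//       physical = (--j)->second;          // newest remap with version <= v
+//     }
+//   }
+//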
+class DWALPager : public IPager2 { +public: + typedef FastAllocatedPage Page; + typedef FIFOQueue LogicalPageQueueT; + +#pragma pack(push, 1) + struct DelayedFreePage { + Version version; + LogicalPageID pageID; + + bool operator<(const DelayedFreePage &rhs) const { + return version < rhs.version; + } + + std::string toString() const { + return format("DelayedFreePage{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); + } + }; + + struct RemappedPage { + Version version; + LogicalPageID originalPageID; + LogicalPageID newPageID; + + bool operator<(const RemappedPage &rhs) { + return version < rhs.version; + } + + std::string toString() const { + return format("RemappedPage(%s -> %s @%" PRId64 "}", ::toString(originalPageID).c_str(), ::toString(newPageID).c_str(), version); + } + }; + +#pragma pack(pop) + + typedef FIFOQueue DelayedFreePageQueueT; + typedef FIFOQueue RemapQueueT; + + // If the file already exists, pageSize might be different than desiredPageSize + // Use pageCacheSizeBytes == 0 for default + DWALPager(int desiredPageSize, std::string filename, int64_t pageCacheSizeBytes) + : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) + { + if(pageCacheBytes == 0) { + pageCacheBytes = g_network->isSimulated() ? (BUGGIFY ? FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS->SIM_PAGE_CACHE_4K) : FLOW_KNOBS->PAGE_CACHE_4K; + } + commitFuture = Void(); + recoverFuture = forwardError(recover(this), errorPromise); + } + + void setPageSize(int size) { + logicalPageSize = size; + physicalPageSize = smallestPhysicalBlock; + while(logicalPageSize > physicalPageSize) { + physicalPageSize += smallestPhysicalBlock; + } + if(pHeader != nullptr) { + pHeader->pageSize = logicalPageSize; + } + pageCache.setSizeLimit(pageCacheBytes / physicalPageSize); + } + + void updateCommittedHeader() { + memcpy(lastCommittedHeaderPage->mutate(), headerPage->begin(), smallestPhysicalBlock); + } + + ACTOR static Future recover(DWALPager *self) { + ASSERT(!self->recoverFuture.isValid()); + + self->remapUndoFuture = Void(); + + int64_t flags = IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK; + state bool exists = fileExists(self->filename); + if(!exists) { + flags |= IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE; + } + + wait(store(self->pageFile, IAsyncFileSystem::filesystem()->open(self->filename, flags, 0644))); + + // Header page is always treated as having a page size of smallestPhysicalBlock + self->setPageSize(smallestPhysicalBlock); + self->lastCommittedHeaderPage = self->newPageBuffer(); + self->pLastCommittedHeader = (Header *)self->lastCommittedHeaderPage->begin(); + + state int64_t fileSize = 0; + if(exists) { + wait(store(fileSize, self->pageFile->size())); + } + + debug_printf("DWALPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, fileSize); + // TODO: If the file exists but appears to never have been successfully committed is this an error or + // should recovery proceed with a new pager instance? 
+
+        // If there are at least 2 pages then try to recover the existing file
+        if(exists && fileSize >= (self->smallestPhysicalBlock * 2)) {
+            debug_printf("DWALPager(%s) recovering using existing file\n", self->filename.c_str());
+
+            state bool recoveredHeader = false;
+
+            // Read physical page 0 directly
+            wait(store(self->headerPage, self->readHeaderPage(self, 0)));
+
+            // If the checksum fails for the header page, try to recover the committed header backup from page 1
+            if(!self->headerPage.castTo<Page>()->verifyChecksum(0)) {
+                TraceEvent(SevWarn, "DWALPagerRecoveringHeader").detail("Filename", self->filename);
+
+                wait(store(self->headerPage, self->readHeaderPage(self, 1)));
+
+                if(!self->headerPage.castTo<Page>()->verifyChecksum(1)) {
+                    if(g_network->isSimulated()) {
+                        // TODO: Detect if the process is being restarted and only throw injected if so?
+                        throw io_error().asInjectedFault();
+                    }
+
+                    Error e = checksum_failed();
+                    TraceEvent(SevError, "DWALPagerRecoveryFailed")
+                        .detail("Filename", self->filename)
+                        .error(e);
+                    throw e;
+                }
+                recoveredHeader = true;
+            }
+
+            self->pHeader = (Header *)self->headerPage->begin();
+
+            if(self->pHeader->formatVersion != Header::FORMAT_VERSION) {
+                Error e = internal_error();  // TODO: Something better?
+                TraceEvent(SevError, "DWALPagerRecoveryFailedWrongVersion")
+                    .detail("Filename", self->filename)
+                    .detail("Version", self->pHeader->formatVersion)
+                    .detail("ExpectedVersion", Header::FORMAT_VERSION)
+                    .error(e);
+                throw e;
+            }
+
+            self->setPageSize(self->pHeader->pageSize);
+            if(self->logicalPageSize != self->desiredPageSize) {
+                TraceEvent(SevWarn, "DWALPagerPageSizeNotDesired")
+                    .detail("Filename", self->filename)
+                    .detail("ExistingPageSize", self->logicalPageSize)
+                    .detail("DesiredPageSize", self->desiredPageSize);
+            }
+
+            self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered");
+            self->delayedFreeList.recover(self, self->pHeader->delayedFreeList, "DelayedFreeListRecovered");
+            self->remapQueue.recover(self, self->pHeader->remapQueue, "RemapQueueRecovered");
+
+            Standalone<VectorRef<RemappedPage>> remaps = wait(self->remapQueue.peekAll());
+            for(auto &r : remaps) {
+                if(r.newPageID != invalidLogicalPageID) {
+                    self->remappedPages[r.originalPageID][r.version] = r.newPageID;
+                }
+            }
+
+            // If the header was recovered from the backup at page 1 then write and sync it to page 0 before continuing.
+            // If this fails, the backup header is still intact for the next recovery attempt.
+            if(recoveredHeader) {
+                // Write the header to page 0
+                wait(self->writeHeaderPage(0, self->headerPage));
+
+                // Wait for all outstanding writes to complete
+                wait(self->operations.signalAndCollapse());
+
+                // Sync the header
+                wait(self->pageFile->sync());
+                debug_printf("DWALPager(%s) Header recovery complete.\n", self->filename.c_str());
+            }
+
+            // Update the last committed header with the one that was recovered (which is the last known committed header)
+            self->updateCommittedHeader();
+            self->addLatestSnapshot();
+        }
+        else {
+            // Note: If the file contains fewer than 2 pages but more than 0 bytes then the pager was never successfully committed.
+            // A new pager will be created in its place.
+            // TODO: Is this the right behavior?
+
+			debug_printf("DWALPager(%s) creating new pager\n", self->filename.c_str());
+
+			self->headerPage = self->newPageBuffer();
+			self->pHeader = (Header *)self->headerPage->begin();
+
+			// Now that the header page has been allocated, set page size to desired
+			self->setPageSize(self->desiredPageSize);
+
+			// Write new header using desiredPageSize
+			self->pHeader->formatVersion = Header::FORMAT_VERSION;
+			self->pHeader->committedVersion = 1;
+			self->pHeader->oldestVersion = 1;
+			// No meta key until a user sets one and commits
+			self->pHeader->setMetaKey(Key());
+
+			// There are 2 reserved pages:
+			//   Page 0 - header
+			//   Page 1 - header backup
+			self->pHeader->pageCount = 2;
+
+			// Create queues
+			self->freeList.create(self, self->newLastPageID(), "FreeList");
+			self->delayedFreeList.create(self, self->newLastPageID(), "delayedFreeList");
+			self->remapQueue.create(self, self->newLastPageID(), "remapQueue");
+
+			// The first commit() below will flush the queues and update the queue states in the header,
+			// but since the queues will not be used between now and then their states will not change.
+			// In order to populate lastCommittedHeader, update the header now with the queue states.
+			self->pHeader->freeList = self->freeList.getState();
+			self->pHeader->delayedFreeList = self->delayedFreeList.getState();
+			self->pHeader->remapQueue = self->remapQueue.getState();
+
+			// Set remaining header bytes to \xff
+			memset(self->headerPage->mutate() + self->pHeader->size(), 0xff, self->headerPage->size() - self->pHeader->size());
+
+			// Since there is no previously committed header use the initial header for the initial commit.
+			self->updateCommittedHeader();
+
+			wait(self->commit());
+		}
+
+		debug_printf("DWALPager(%s) recovered. committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, self->physicalPageSize);
+		return Void();
+	}
+
+	Reference<IPage> newPageBuffer() override {
+		return Reference<IPage>(new FastAllocatedPage(logicalPageSize, physicalPageSize));
+	}
+
+	// Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead).
+	// For a given pager instance, separate calls to this function must return the same value.
+	int getUsablePageSize() override {
+		return logicalPageSize - sizeof(FastAllocatedPage::Checksum);
+	}
+
+	// Get a new, previously available page ID.
+	// The page will be considered in-use after the next commit
+	// regardless of whether or not it was written to, until it is returned to the pager via freePage()
+	ACTOR static Future<LogicalPageID> newPageID_impl(DWALPager *self) {
+		// First try the free list
+		Optional<LogicalPageID> freePageID = wait(self->freeList.pop());
+		if(freePageID.present()) {
+			debug_printf("DWALPager(%s) newPageID() returning %s from free list\n", self->filename.c_str(), toString(freePageID.get()).c_str());
+			return freePageID.get();
+		}
+
+		// Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in the snapshots list
+		ASSERT(!self->snapshots.empty());
+		Optional<DelayedFreePage> delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{self->effectiveOldestVersion(), 0}));
+		if(delayedFreePageID.present()) {
+			debug_printf("DWALPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), toString(delayedFreePageID.get()).c_str());
+			return delayedFreePageID.get().pageID;
+		}
+
+		// Lastly, add a new page to the pager
+		LogicalPageID id = self->newLastPageID();
+		debug_printf("DWALPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), toString(id).c_str());
+		return id;
+	}
+
+	// Grow the pager file by one page and return it
+	LogicalPageID newLastPageID() {
+		LogicalPageID id = pHeader->pageCount;
+		++pHeader->pageCount;
+		return id;
+	}
+
+	Future<LogicalPageID> newPageID() override {
+		return newPageID_impl(this);
+	}
+
+	Future<Void> writePhysicalPage(PhysicalPageID pageID, Reference<IPage> page, bool header = false) {
+		debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? "writePhysicalHeader" : "writePhysical"), toString(pageID).c_str(), page->begin());
+
+		VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size());
+		((Page *)page.getPtr())->updateChecksum(pageID);
+
+		// Note: Not using forwardError here so a write error won't be discovered until commit time.
+		int blockSize = header ? smallestPhysicalBlock : physicalPageSize;
+		Future<Void> f = holdWhile(page, map(pageFile->write(page->begin(), blockSize, (int64_t)pageID * blockSize), [=](Void) {
+			debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? "writePhysicalHeaderComplete" : "writePhysicalComplete"), toString(pageID).c_str(), page->begin());
+			return Void();
+		}));
+		operations.add(f);
+		return f;
+	}
+
+	Future<Void> writeHeaderPage(PhysicalPageID pageID, Reference<IPage> page) {
+		return writePhysicalPage(pageID, page, true);
+	}
+
+	void updatePage(LogicalPageID pageID, Reference<IPage> data) override {
+		// Get the cache entry for this page, without counting it as a cache hit as we're replacing its contents now
+		PageCacheEntry &cacheEntry = pageCache.get(pageID, true);
+		debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing());
+
+		// If the page is still being read then it's not also being written because a write places
+		// the new content into readFuture when the write is launched, not when it is completed.
+		// Read/write ordering is enforced so that waiting readers will not see the new write. This
+		// is necessary for remap erasure to work correctly since the oldest version of a page, located
+		// at the original page ID, could have a pending read when that version is expired and the write
+		// of the next newest version over top of the original page begins.
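+		// The branches below handle, in order: a brand new cache entry, an entry with a read
+		// in flight, an entry with a write in flight, and an idle entry. In the two in-flight
+		// cases the new write is chained behind the existing future so that physical I/O on
+		// this page stays ordered.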
+ if(!cacheEntry.initialized()) { + cacheEntry.writeFuture = writePhysicalPage(pageID, data); + } + else if(cacheEntry.reading()) { + // Wait for the read to finish, then start the write. + cacheEntry.writeFuture = map(success(cacheEntry.readFuture), [=](Void) { + writePhysicalPage(pageID, data); + return Void(); + }); + } + // If the page is being written, wait for this write before issuing the new write to ensure the + // writes happen in the correct order + else if(cacheEntry.writing()) { + cacheEntry.writeFuture = map(cacheEntry.writeFuture, [=](Void) { + writePhysicalPage(pageID, data); + return Void(); + }); + } + else { + cacheEntry.writeFuture = writePhysicalPage(pageID, data); + } + + // Always update the page contents immediately regardless of what happened above. + cacheEntry.readFuture = data; + } + + Future atomicUpdatePage(LogicalPageID pageID, Reference data, Version v) override { + debug_printf("DWALPager(%s) op=writeAtomic %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); + // This pager does not support atomic update, so it always allocates and uses a new pageID + Future f = map(newPageID(), [=](LogicalPageID newPageID) { + updatePage(newPageID, data); + // TODO: Possibly limit size of remap queue since it must be recovered on cold start + RemappedPage r{v, pageID, newPageID}; + remapQueue.pushBack(r); + remappedPages[pageID][v] = newPageID; + debug_printf("DWALPager(%s) pushed %s\n", filename.c_str(), RemappedPage(r).toString().c_str()); + return pageID; + }); + + // No need for forwardError here because newPageID() is already wrapped in forwardError + return f; + } + + void freePage(LogicalPageID pageID, Version v) override { + // If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone, so queue it for later deletion + if(remappedPages.find(pageID) != remappedPages.end()) { + debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + remapQueue.pushBack(RemappedPage{v, pageID, invalidLogicalPageID}); + return; + } + + // If v is older than the oldest version still readable then mark pageID as free as of the next commit + if(v < effectiveOldestVersion()) { + debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + freeList.pushBack(pageID); + } + else { + // Otherwise add it to the delayed free list + debug_printf("DWALPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + delayedFreeList.pushBack({v, pageID}); + } + }; + + // Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock + // If the user chosen physical page size is larger, then there will be a gap of unused space after the header pages + // and before the user-chosen sized pages. + ACTOR static Future> readPhysicalPage(DWALPager *self, PhysicalPageID pageID, bool header = false) { + if(g_network->getCurrentTask() > TaskPriority::DiskRead) { + wait(delay(0, TaskPriority::DiskRead)); + } + + state Reference page = header ? 
Reference(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)) : self->newPageBuffer(); + debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p\n", self->filename.c_str(), toString(pageID).c_str(), page->begin()); + + int blockSize = header ? smallestPhysicalBlock : self->physicalPageSize; + // TODO: Could a dispatched read try to write to page after it has been destroyed if this actor is cancelled? + int readBytes = wait(self->pageFile->read(page->mutate(), blockSize, (int64_t)pageID * blockSize)); + debug_printf("DWALPager(%s) op=readPhysicalComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), page->begin(), readBytes); + + // Header reads are checked explicitly during recovery + if(!header) { + Page *p = (Page *)page.getPtr(); + if(!p->verifyChecksum(pageID)) { + debug_printf("DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageID).c_str()); + Error e = checksum_failed(); + TraceEvent(SevError, "DWALPagerChecksumFailed") + .detail("Filename", self->filename.c_str()) + .detail("PageID", pageID) + .detail("PageSize", self->physicalPageSize) + .detail("Offset", pageID * self->physicalPageSize) + .detail("CalculatedChecksum", p->calculateChecksum(pageID)) + .detail("ChecksumInPage", p->getChecksum()) + .error(e); + throw e; + } + } + return page; + } + + static Future> readHeaderPage(DWALPager *self, PhysicalPageID pageID) { + return readPhysicalPage(self, pageID, true); + } + + // Reads the most recent version of pageID either committed or written using updatePage() + Future> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override { + // Use cached page if present, without triggering a cache hit. + // Otherwise, read the page and return it but don't add it to the cache + if(!cacheable) { + debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageID).c_str()); + PageCacheEntry *pCacheEntry = pageCache.getIfExists(pageID); + if(pCacheEntry != nullptr) { + debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageID).c_str()); + return pCacheEntry->readFuture; + } + + debug_printf("DWALPager(%s) op=readUncachedMiss %s\n", filename.c_str(), toString(pageID).c_str()); + return forwardError(readPhysicalPage(this, (PhysicalPageID)pageID), errorPromise); + } + + PageCacheEntry &cacheEntry = pageCache.get(pageID, noHit); + debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing(), noHit); + + if(!cacheEntry.initialized()) { + debug_printf("DWALPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageID).c_str()); + cacheEntry.readFuture = readPhysicalPage(this, (PhysicalPageID)pageID); + cacheEntry.writeFuture = Void(); + } + + cacheEntry.readFuture = forwardError(cacheEntry.readFuture, errorPromise); + return cacheEntry.readFuture; + } + + Future> readPageAtVersion(LogicalPageID pageID, Version v, bool cacheable, bool noHit) { + auto i = remappedPages.find(pageID); + + if(i != remappedPages.end()) { + auto j = i->second.upper_bound(v); + if(j != i->second.begin()) { + --j; + debug_printf("DWALPager(%s) read %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(), v, toString(j->second).c_str()); + pageID = j->second; + } + } + else { + debug_printf("DWALPager(%s) read %s @%" PRId64 " (not remapped)\n", filename.c_str(), 
			toString(pageID).c_str(), v);
+		}
+
+		return readPage(pageID, cacheable, noHit);
+	}
+
+	// Get snapshot as of the most recent committed version of the pager
+	Reference<IPagerSnapshot> getReadSnapshot(Version v) override;
+	void addLatestSnapshot();
+
+	// Set the pending oldest version to keep as of the next commit
+	void setOldestVersion(Version v) override {
+		ASSERT(v >= pHeader->oldestVersion);
+		ASSERT(v <= pHeader->committedVersion);
+		pHeader->oldestVersion = v;
+		expireSnapshots(v);
+	}
+
+	// Get the oldest version set as of the last commit.
+	Version getOldestVersion() override {
+		return pLastCommittedHeader->oldestVersion;
+	}
+
+	// Calculate the *effective* oldest version, which can be older than the one set in the last commit since we
+	// are allowing active snapshots to temporarily delay page reuse.
+	Version effectiveOldestVersion() {
+		return std::min(pLastCommittedHeader->oldestVersion, snapshots.front().version);
+	}
+
+	ACTOR static Future<Void> undoRemaps(DWALPager *self) {
+		state RemappedPage cutoff;
+		cutoff.version = self->effectiveOldestVersion();
+
+		// TODO: Use parallel reads
+		// TODO: One run of this actor might write to the same original page more than once, in which case just unmap the latest
+		loop {
+			if(self->remapUndoStop) {
+				break;
+			}
+			state Optional<RemappedPage> p = wait(self->remapQueue.pop(cutoff));
+			if(!p.present()) {
+				break;
+			}
+			debug_printf("DWALPager(%s) undoRemaps popped %s\n", self->filename.c_str(), p.get().toString().c_str());
+
+			if(p.get().newPageID == invalidLogicalPageID) {
+				debug_printf("DWALPager(%s) undoRemaps freeing %s\n", self->filename.c_str(), p.get().toString().c_str());
+				self->freePage(p.get().originalPageID, p.get().version);
+			}
+			else {
+				// Read the data from the page that the original was mapped to
+				Reference<IPage> data = wait(self->readPage(p.get().newPageID, false));
+
+				// Write the data to the original page so it can be read using its original pageID
+				self->updatePage(p.get().originalPageID, data);
+
+				// Remove the remap from this page, deleting the entry for the pageID if its map becomes empty
+				auto i = self->remappedPages.find(p.get().originalPageID);
+				if(i->second.size() == 1) {
+					self->remappedPages.erase(i);
+				}
+				else {
+					i->second.erase(p.get().version);
+				}
+
+				// Now that the remap has been undone nothing will read this page so it can be freed as of the next commit.
+				self->freePage(p.get().newPageID, 0);
+			}
+		}
+
+		debug_printf("DWALPager(%s) undoRemaps stopped, remapQueue size is %" PRId64 "\n", self->filename.c_str(), self->remapQueue.numEntries);
+		return Void();
+	}
+
+	// Flush all queues so they have no operations pending.
+	ACTOR static Future<Void> flushQueues(DWALPager *self) {
+		ASSERT(self->remapUndoFuture.isReady());
+
+		// Flush remap queue separately, it's not involved in free page management
+		wait(self->remapQueue.flush());
+
+		// Flush the free list and delayed free list queues together as they are used by freePage() and newPageID()
+		loop {
+			state bool freeBusy = wait(self->freeList.preFlush());
+			state bool delayedFreeBusy = wait(self->delayedFreeList.preFlush());
+
+			// Once preFlush() returns false for both queues then there are no more operations pending
+			// on either queue. If preFlush() returns true for either queue in one loop execution then
+			// it could have generated new work for itself or the other queue.
+ if(!freeBusy && !delayedFreeBusy) { + break; + } + } + self->freeList.finishFlush(); + self->delayedFreeList.finishFlush(); + + return Void(); + } + + ACTOR static Future commit_impl(DWALPager *self) { + debug_printf("DWALPager(%s) commit begin\n", self->filename.c_str()); + + // Write old committed header to Page 1 + self->writeHeaderPage(1, self->lastCommittedHeaderPage); + + // Trigger the remap eraser to stop and then wait for it. + self->remapUndoStop = true; + wait(self->remapUndoFuture); + + wait(flushQueues(self)); + + self->pHeader->remapQueue = self->remapQueue.getState(); + self->pHeader->freeList = self->freeList.getState(); + self->pHeader->delayedFreeList = self->delayedFreeList.getState(); + + // Wait for all outstanding writes to complete + debug_printf("DWALPager(%s) waiting for outstanding writes\n", self->filename.c_str()); + wait(self->operations.signalAndCollapse()); + debug_printf("DWALPager(%s) Syncing\n", self->filename.c_str()); + + // Sync everything except the header + if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { + wait(delay(0, TaskPriority::DiskWrite)); + } + wait(self->pageFile->sync()); + debug_printf("DWALPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), self->pHeader->committedVersion); + + // Update header on disk and sync again. + wait(self->writeHeaderPage(0, self->headerPage)); + if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { + wait(delay(0, TaskPriority::DiskWrite)); + } + wait(self->pageFile->sync()); + debug_printf("DWALPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), self->pHeader->committedVersion); + + // Update the last committed header for use in the next commit. + self->updateCommittedHeader(); + self->addLatestSnapshot(); + + // Try to expire snapshots up to the oldest version, in case some were being kept around due to being in use, + // because maybe some are no longer in use. + self->expireSnapshots(self->pHeader->oldestVersion); + + // Start unmapping pages for expired versions + self->remapUndoStop = false; + self->remapUndoFuture = undoRemaps(self); + + return Void(); + } + + Future commit() override { + // Can't have more than one commit outstanding. 
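+		// The sequence in commit_impl() above is what makes commits crash-safe: the previous
+		// committed header is saved to page 1, data and queue pages are flushed and synced,
+		// and only then is the new header written to page 0 and synced. A crash between the
+		// two syncs recovers from the backup header; a crash after the second sync recovers
+		// from the new one.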
+		ASSERT(commitFuture.isReady());
+		commitFuture = forwardError(commit_impl(this), errorPromise);
+		return commitFuture;
+	}
+
+	Key getMetaKey() const override {
+		return pHeader->getMetaKey();
+	}
+
+	void setCommitVersion(Version v) override {
+		pHeader->committedVersion = v;
+	}
+
+	void setMetaKey(KeyRef metaKey) override {
+		pHeader->setMetaKey(metaKey);
+	}
+
+	ACTOR void shutdown(DWALPager *self, bool dispose) {
+		debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str());
+		self->recoverFuture.cancel();
+		debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str());
+		self->commitFuture.cancel();
+		debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str());
+		self->remapUndoFuture.cancel();
+
+		if(self->errorPromise.canBeSet()) {
+			debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str());
+			self->errorPromise.sendError(actor_cancelled());  // Ideally this should be shutdown_in_progress
+		}
+
+		// Must wait for pending operations to complete, canceling them can cause a crash because the underlying
+		// operations may be uncancellable and depend on memory from calling scope's page reference
+		debug_printf("DWALPager(%s) shutdown wait for operations\n", self->filename.c_str());
+		wait(self->operations.signal());
+
+		debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str());
+		wait(self->pageCache.clear());
+
+		// Unreference the file and clear
+		self->pageFile.clear();
+		if(dispose) {
+			debug_printf("DWALPager(%s) shutdown deleting file\n", self->filename.c_str());
+			wait(IAsyncFileSystem::filesystem()->incrementalDeleteFile(self->filename, true));
+		}
+
+		self->closedPromise.send(Void());
+		delete self;
+	}
+
+	void dispose() override {
+		shutdown(this, true);
+	}
+
+	void close() override {
+		shutdown(this, false);
+	}
+
+	Future<Void> getError() override {
+		return errorPromise.getFuture();
+	}
+
+	Future<Void> onClosed() override {
+		return closedPromise.getFuture();
+	}
+
+	StorageBytes getStorageBytes() override {
+		ASSERT(recoverFuture.isReady());
+		int64_t free;
+		int64_t total;
+		g_network->getDiskBytes(parentDirectory(filename), free, total);
+		int64_t pagerSize = pHeader->pageCount * physicalPageSize;
+
+		// It is not exactly known how many pages on the delayed free list are usable as of right now. It could be
+		// known, if at each commit the delayed entries that became freeable were shuffled from the delayed free
+		// queue to the free queue, but this doesn't seem necessary most of the time.
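+		// Example (hypothetical numbers): with 4k physical pages, pageCount = 1000 and 100
+		// total entries across the two free queues, this reports pagerSize = 4,096,000 bytes
+		// and available = free + 409,600 bytes.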
+ int64_t reusable = (freeList.numEntries + delayedFreeList.numEntries) * physicalPageSize; + + return StorageBytes(free, total, pagerSize, free + reusable); + } + + ACTOR static Future getUserPageCount_cleanup(DWALPager *self) { + // Wait for the remap eraser to finish all of its work (not triggering stop) + wait(self->remapUndoFuture); + + // Flush queues so there are no pending freelist operations + wait(flushQueues(self)); + + return Void(); + } + + // Get the number of pages in use by the pager's user + Future getUserPageCount() override { + return map(getUserPageCount_cleanup(this), [=](Void) { + int64_t userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages; + debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 " remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n", + filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, delayedFreeList.numEntries, remapQueue.numPages, remapQueue.numEntries); + return userPages; + }); + } + + Future init() override { + return recoverFuture; + } + + Version getLatestVersion() override { + return pLastCommittedHeader->committedVersion; + } + +private: + ~DWALPager() {} + + // Try to expire snapshots up to but not including v, but do not expire any snapshots that are in use. + void expireSnapshots(Version v); + +#pragma pack(push, 1) + // Header is the format of page 0 of the database + struct Header { + static constexpr int FORMAT_VERSION = 2; + uint16_t formatVersion; + uint32_t pageSize; + int64_t pageCount; + FIFOQueue::QueueState freeList; + FIFOQueue::QueueState delayedFreeList; + FIFOQueue::QueueState remapQueue; + Version committedVersion; + Version oldestVersion; + int32_t metaKeySize; + + KeyRef getMetaKey() const { + return KeyRef((const uint8_t *)(this + 1), metaKeySize); + } + + void setMetaKey(StringRef key) { + ASSERT(key.size() < (smallestPhysicalBlock - sizeof(Header))); + metaKeySize = key.size(); + memcpy(this + 1, key.begin(), key.size()); + } + + int size() const { + return sizeof(Header) + metaKeySize; + } + + private: + Header(); + }; +#pragma pack(pop) + + struct PageCacheEntry { + Future> readFuture; + Future writeFuture; + + bool initialized() const { + return readFuture.isValid(); + } + + bool reading() const { + return !readFuture.isReady(); + } + + bool writing() const { + return !writeFuture.isReady(); + } + + bool evictable() const { + // Don't evict if a page is still being read or written + return !reading() && !writing(); + } + + Future onEvictable() const { + return ready(readFuture) && writeFuture; + } + }; + + // Physical page sizes will always be a multiple of 4k because AsyncFileNonDurable requires + // this in simulation, and it also makes sense for current SSDs. + // Allowing a smaller 'logical' page size is very useful for testing. + static constexpr int smallestPhysicalBlock = 4096; + int physicalPageSize; + int logicalPageSize; // In simulation testing it can be useful to use a small logical page size + + int64_t pageCacheBytes; + + // The header will be written to / read from disk as a smallestPhysicalBlock sized chunk. 
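+	// pHeader points into headerPage and reflects in-progress, uncommitted changes, while
+	// pLastCommittedHeader points into lastCommittedHeaderPage and is only refreshed by
+	// updateCommittedHeader() after a commit (or recovery) completes.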
+	Reference<IPage> headerPage;
+	Header *pHeader;
+
+	int desiredPageSize;
+
+	Reference<IPage> lastCommittedHeaderPage;
+	Header *pLastCommittedHeader;
+
+	std::string filename;
+
+	typedef ObjectCache<LogicalPageID, PageCacheEntry> PageCacheT;
+	PageCacheT pageCache;
+
+	Promise<Void> closedPromise;
+	Promise<Void> errorPromise;
+	Future<Void> commitFuture;
+	SignalableActorCollection operations;
+	Future<Void> recoverFuture;
+	Future<Void> remapUndoFuture;
+	bool remapUndoStop;
+
+	Reference<IAsyncFile> pageFile;
+
+	LogicalPageQueueT freeList;
+
+	// The delayed free list will be approximately in Version order.
+	// TODO: Make this an ordered container some day.
+	DelayedFreePageQueueT delayedFreeList;
+
+	RemapQueueT remapQueue;
+
+	struct SnapshotEntry {
+		Version version;
+		Promise<Void> expired;
+		Reference<DWALPagerSnapshot> snapshot;
+	};
+
+	struct SnapshotEntryLessThanVersion {
+		bool operator() (Version v, const SnapshotEntry &snapshot) {
+			return v < snapshot.version;
+		}
+
+		bool operator() (const SnapshotEntry &snapshot, Version v) {
+			return snapshot.version < v;
+		}
+	};
+
+	// TODO: Better data structure
+	std::unordered_map<LogicalPageID, std::map<Version, LogicalPageID>> remappedPages;
+
+	std::deque<SnapshotEntry> snapshots;
+};
+
+// Prevents pager from reusing freed pages from version until the snapshot is destroyed
+class DWALPagerSnapshot : public IPagerSnapshot, public ReferenceCounted<DWALPagerSnapshot> {
+public:
+	DWALPagerSnapshot(DWALPager *pager, Key meta, Version version, Future<Void> expiredFuture) : pager(pager), metaKey(meta), version(version), expired(expiredFuture) {
+	}
+	virtual ~DWALPagerSnapshot() {
+	}
+
+	Future<Reference<const IPage>> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool noHit) override {
+		if(expired.isError()) {
+			throw expired.getError();
+		}
+		return map(pager->readPageAtVersion(pageID, version, cacheable, noHit), [=](Reference<IPage> p) {
+			return Reference<const IPage>(p);
+		});
+	}
+
+	Key getMetaKey() const override {
+		return metaKey;
+	}
+
+	Version getVersion() const override {
+		return version;
+	}
+
+	void addref() override {
+		ReferenceCounted<DWALPagerSnapshot>::addref();
+	}
+
+	void delref() override {
+		ReferenceCounted<DWALPagerSnapshot>::delref();
+	}
+
+	DWALPager *pager;
+	Future<Void> expired;
+	Version version;
+	Key metaKey;
+};
+
+void DWALPager::expireSnapshots(Version v) {
+	debug_printf("DWALPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size());
+	while(snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) {
+		debug_printf("DWALPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), snapshots.front().version, snapshots.front().snapshot->isSoleOwner());
+		// The snapshot contract could be made such that the expired promise isn't needed anymore. In practice it
+		// probably is already not needed but it will gracefully handle the case where a user begins a page read
+		// with a snapshot reference, keeps the page read future, and drops the snapshot reference.
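+		// Sending the error below means a reader which kept only the snapshot's read path,
+		// and not a snapshot reference, will see transaction_too_old on its next read via the
+		// expired-future check in DWALPagerSnapshot::getPhysicalPage().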
+ snapshots.front().expired.sendError(transaction_too_old()); + snapshots.pop_front(); + } +} + +Reference DWALPager::getReadSnapshot(Version v) { + ASSERT(!snapshots.empty()); + + auto i = std::upper_bound(snapshots.begin(), snapshots.end(), v, SnapshotEntryLessThanVersion()); + if(i == snapshots.begin()) { + throw version_invalid(); + } + --i; + return i->snapshot; +} + +void DWALPager::addLatestSnapshot() { + Promise expired; + snapshots.push_back({ + pLastCommittedHeader->committedVersion, + expired, + Reference(new DWALPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion, expired.getFuture())) + }); +} + // TODO: Move this to a flow header once it is mature. struct SplitStringRef { @@ -146,6 +1886,14 @@ struct SplitStringRef { }; +// A BTree "page id" is actually a list of LogicalPageID's whose contents should be concatenated together. +// NOTE: Uses host byte order +typedef VectorRef BTreePageID; + +std::string toString(BTreePageID id) { + return std::string("BTreePageID") + toString(id.begin(), id.end()); +} + #define STR(x) LiteralStringRef(x) struct RedwoodRecordRef { typedef uint8_t byte; @@ -159,12 +1907,7 @@ struct RedwoodRecordRef { : key(arena, toCopy.key), version(toCopy.version), chunk(toCopy.chunk) { if(toCopy.value.present()) { - if(toCopy.localValue()) { - setPageID(toCopy.getPageID()); - } - else { - value = ValueRef(arena, toCopy.value.get()); - } + value = ValueRef(arena, toCopy.value.get()); } } @@ -174,54 +1917,24 @@ struct RedwoodRecordRef { deserializeIntFields(intFields); } - RedwoodRecordRef(const RedwoodRecordRef &toCopy) : key(toCopy.key), version(toCopy.version), chunk(toCopy.chunk) { - if(toCopy.value.present()) { - if(toCopy.localValue()) { - setPageID(toCopy.getPageID()); - } - else { - value = toCopy.value; - } - } - } - - RedwoodRecordRef & operator= (const RedwoodRecordRef &toCopy) { - key = toCopy.key; - version = toCopy.version; - chunk = toCopy.chunk; - if(toCopy.value.present()) { - if(toCopy.localValue()) { - setPageID(toCopy.getPageID()); - } - else { - value = toCopy.value; - } - } - - return *this; - } - - bool localValue() const { - return value.get().begin() == bigEndianPageIDSpace; - } - // RedwoodRecordRefs are used for both internal and leaf pages of the BTree. // Boundary records in internal pages are made from leaf records. // These functions make creating and working with internal page records more convenient. 
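+	// A child link is stored directly as a record's value: the raw bytes of one or more
+	// LogicalPageIDs in host byte order, so for internal page records value.get().size()
+	// is always a multiple of sizeof(LogicalPageID).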
- inline LogicalPageID getPageID() const { + inline BTreePageID getChildPage() const { ASSERT(value.present()); - return bigEndian32(*(LogicalPageID *)value.get().begin()); + return BTreePageID((LogicalPageID *)value.get().begin(), value.get().size() / sizeof(LogicalPageID)); } - inline void setPageID(LogicalPageID id) { - *(LogicalPageID *)bigEndianPageIDSpace = bigEndian32(id); - value = ValueRef(bigEndianPageIDSpace, sizeof(bigEndianPageIDSpace)); + inline void setChildPage(BTreePageID id) { + value = ValueRef((const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)); } - inline RedwoodRecordRef withPageID(LogicalPageID id) const { - RedwoodRecordRef rec(key, version, {}, chunk.total, chunk.start); - rec.setPageID(id); - return rec; + inline void setChildPage(Arena &arena, BTreePageID id) { + value = ValueRef(arena, (const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)); + } + + inline RedwoodRecordRef withPageID(BTreePageID id) const { + return RedwoodRecordRef(key, version, ValueRef((const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)), chunk.total, chunk.start); } inline RedwoodRecordRef withoutValue() const { @@ -302,13 +2015,10 @@ struct RedwoodRecordRef { Version version; struct { uint32_t total; - // TODO: Change start to chunk number. + // TODO: Change start to chunk number? uint32_t start; } chunk; - // If the value is a page ID it will be stored here - uint8_t bigEndianPageIDSpace[sizeof(LogicalPageID)]; - int expectedSize() const { return key.expectedSize() + value.expectedSize(); } @@ -462,10 +2172,12 @@ struct RedwoodRecordRef { StringRef k; + // Separate the borrowed key string byte count from the borrowed int field byte count int keyPrefixLen = std::min(prefixLen, base.key.size()); int intFieldPrefixLen = prefixLen - keyPrefixLen; int keySuffixLen = (flags & HAS_KEY_SUFFIX) ? r.readVarInt() : 0; + // If there is a key suffix, reconstitute the complete key into a contiguous string if(keySuffixLen > 0) { k = makeString(keyPrefixLen + keySuffixLen, arena); memcpy(mutateString(k), base.key.begin(), keyPrefixLen); @@ -537,6 +2249,30 @@ struct RedwoodRecordRef { size(), flagString.c_str(), prefixLen, keySuffixLen, intFieldSuffixLen, valueLen, StringRef((const uint8_t *)this, size()).toHexString().c_str()); } }; + + // Using this class as an alternative for Delta enables reading a DeltaTree while only decoding + // its values, so the Reader does not require the original prev/next ancestors. + struct DeltaValueOnly : Delta { + RedwoodRecordRef apply(const RedwoodRecordRef &base, Arena &arena) const { + Reader r(data()); + + // Skip prefix length + r.readVarInt(); + + // Get value length + int valueLen = (flags & HAS_VALUE) ? r.read() : 0; + + // Skip key suffix length and bytes if exists + if(flags & HAS_KEY_SUFFIX) { + r.readString(r.readVarInt()); + } + + // Skip int field suffix if present + r.readBytes(flags & INT_FIELD_SUFFIX_BITS); + + return RedwoodRecordRef(StringRef(), 0, (flags & HAS_VALUE ? r.readString(valueLen) : Optional()) ); + } + }; #pragma pack(pop) // Compares and orders by key, version, chunk.start, chunk.total. @@ -719,7 +2455,7 @@ struct RedwoodRecordRef { if(value.present()) { // Assume that values the size of a page ID are page IDs. It's not perfect but it's just for debugging. 
if(value.get().size() == sizeof(LogicalPageID)) { - r += format("[PageID=%u]", getPageID()); + r += format("[%s]", ::toString(getChildPage()).c_str()); } else { r += format("'%s'", kvformat(value.get(), hexLimit).c_str()); @@ -733,56 +2469,45 @@ struct RedwoodRecordRef { }; struct BTreePage { - - enum EPageFlags { IS_LEAF = 1}; - typedef DeltaTree BinaryTree; + typedef DeltaTree ValueTree; #pragma pack(push,1) struct { - uint8_t flags; - uint16_t count; + uint8_t height; + uint16_t itemCount; uint32_t kvBytes; - uint8_t extensionPageCount; }; #pragma pack(pop) - inline LogicalPageID * extensionPages() { - return (LogicalPageID *)(this + 1); - } - - inline const LogicalPageID * extensionPages() const { - return (const LogicalPageID *)(this + 1); - } - int size() const { const BinaryTree *t = &tree(); return (uint8_t *)t - (uint8_t *)this + t->size(); } bool isLeaf() const { - return flags & IS_LEAF; + return height == 1; } BinaryTree & tree() { - return *(BinaryTree *)(extensionPages() + extensionPageCount); + return *(BinaryTree *)(this + 1); } const BinaryTree & tree() const { - return *(const BinaryTree *)(extensionPages() + extensionPageCount); + return *(const BinaryTree *)(this + 1); } - static inline int GetHeaderSize(int extensionPages = 0) { - return sizeof(BTreePage) + (extensionPages * sizeof(LogicalPageID)); + const ValueTree & valueTree() const { + return *(const ValueTree *)(this + 1); } - std::string toString(bool write, LogicalPageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { + std::string toString(bool write, BTreePageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { std::string r; - r += format("BTreePage op=%s id=%d ver=%" PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d extPages=%d\n lowerBound: %s\n upperBound: %s\n", - write ? "write" : "read", id, ver, this, (int)flags, (int)count, (int)kvBytes, (int)extensionPageCount, + r += format("BTreePage op=%s %s @%" PRId64 " ptr=%p height=%d count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", + write ? "write" : "read", ::toString(id).c_str(), ver, this, height, (int)itemCount, (int)kvBytes, lowerBound->toString().c_str(), upperBound->toString().c_str()); try { - if(count > 0) { + if(itemCount > 0) { // This doesn't use the cached reader for the page but it is only for debugging purposes BinaryTree::Reader reader(&tree(), lowerBound, upperBound); BinaryTree::Cursor c = reader.getCursor(); @@ -821,13 +2546,11 @@ struct BTreePage { } }; -static void makeEmptyPage(Reference page, uint8_t newFlags, int pageSize) { - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); +static void makeEmptyRoot(Reference page) { BTreePage *btpage = (BTreePage *)page->begin(); - btpage->flags = newFlags; + btpage->height = 1; btpage->kvBytes = 0; - btpage->count = 0; - btpage->extensionPageCount = 0; + btpage->itemCount = 0; btpage->tree().build(nullptr, nullptr, nullptr, nullptr); } @@ -835,190 +2558,54 @@ BTreePage::BinaryTree::Reader * getReader(Reference page) { return (BTreePage::BinaryTree::Reader *)page->userData; } -struct BoundaryAndPage { +struct BoundaryRefAndPage { Standalone lowerBound; - // Only firstPage or multiPage will be in use at once Reference firstPage; std::vector> extPages; + + std::string toString() const { + return format("[%s, %d pages]", lowerBound.toString().c_str(), extPages.size() + (firstPage ? 1 : 0)); + } }; -// Returns a std::vector of pairs of lower boundary key indices within kvPairs and encoded pages. 
-// TODO: Refactor this as an accumulator you add sorted keys to which makes pages. -template -static std::vector buildPages(bool minimalBoundaries, const RedwoodRecordRef &lowerBound, const RedwoodRecordRef &upperBound, std::vector entries, uint8_t newFlags, Allocator const &newBlockFn, int usableBlockSize) { - // This is how much space for the binary tree exists in the page, after the header - int pageSize = usableBlockSize - BTreePage::GetHeaderSize(); +#define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); } - // Each new block adds (usableBlockSize - sizeof(LogicalPageID)) more net usable space *for the binary tree* to pageSize. - int netTreeBlockSize = usableBlockSize - sizeof(LogicalPageID); +#pragma pack(push, 1) +template +struct InPlaceArray { + SizeT count; - int blockCount = 1; - std::vector pages; - - int kvBytes = 0; - int compressedBytes = BTreePage::BinaryTree::GetTreeOverhead(); - - int start = 0; - int i = 0; - const int iEnd = entries.size(); - // Lower bound of the page being added to - RedwoodRecordRef pageLowerBound = lowerBound.withoutValue(); - RedwoodRecordRef pageUpperBound; - - while(i <= iEnd) { - bool end = i == iEnd; - bool flush = end; - - // If not the end, add i to the page if necessary - if(end) { - pageUpperBound = upperBound.withoutValue(); - } - else { - // Get delta from previous record - const RedwoodRecordRef &entry = entries[i]; - int deltaSize = entry.deltaSize((i == start) ? pageLowerBound : entries[i - 1]); - int keySize = entry.key.size(); - int valueSize = entry.value.present() ? entry.value.get().size() : 0; - - int spaceNeeded = sizeof(BTreePage::BinaryTree::Node) + deltaSize; - - debug_printf("Trying to add record %3d of %3lu (i=%3d) klen %4d vlen %3d deltaSize %4d spaceNeeded %4d compressed %4d / page %4d bytes %s\n", - i + 1, entries.size(), i, keySize, valueSize, deltaSize, - spaceNeeded, compressedBytes, pageSize, entry.toString().c_str()); - - int spaceAvailable = pageSize - compressedBytes; - - // Does it fit? - bool fits = spaceAvailable >= spaceNeeded; - - // If it doesn't fit, either end the current page or increase the page size - if(!fits) { - // For leaf level where minimal boundaries are used require at least 1 entry, otherwise require 4 to enforce a minimum branching factor - int minimumEntries = minimalBoundaries ? 1 : 4; - int count = i - start; - - // If not enough entries or page less than half full, increase page size to make the entry fit - if(count < minimumEntries || spaceAvailable > pageSize / 2) { - // Figure out how many additional whole or partial blocks are needed - int newBlocks = 1 + (spaceNeeded - spaceAvailable - 1) / netTreeBlockSize; - int newPageSize = pageSize + (newBlocks * netTreeBlockSize); - if(newPageSize <= BTreePage::BinaryTree::MaximumTreeSize()) { - blockCount += newBlocks; - pageSize = newPageSize; - fits = true; - } - } - if(!fits) { - pageUpperBound = entry.withoutValue(); - } - } - - // If the record fits then add it to the page set - if(fits) { - kvBytes += keySize + valueSize; - compressedBytes += spaceNeeded; - ++i; - } - - flush = !fits; - } - - // If flush then write a page using records from start to i. It's guaranteed that pageUpperBound has been set above. 
- if(flush) { - end = i == iEnd; // i could have been moved above - - int count = i - start; - // If not writing the final page, reduce entry count of page by a third - if(!end) { - i -= count / 3; - pageUpperBound = entries[i].withoutValue(); - } - - // If this isn't the final page, shorten the upper boundary - if(!end && minimalBoundaries) { - int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[i - 1], 0); - pageUpperBound.truncate(commonPrefix + 1); - } - - debug_printf("Flushing page start=%d i=%d count=%d\nlower: %s\nupper: %s\n", start, i, count, pageLowerBound.toString().c_str(), pageUpperBound.toString().c_str()); -#if REDWOOD_DEBUG - for(int j = start; j < i; ++j) { - debug_printf(" %3d: %s\n", j, entries[j].toString().c_str()); - if(j > start) { - //ASSERT(entries[j] > entries[j - 1]); - } - } - ASSERT(pageLowerBound.key <= pageUpperBound.key); -#endif - - union { - BTreePage *btPage; - uint8_t *btPageMem; - }; - - int allocatedSize; - if(blockCount == 1) { - Reference page = newBlockFn(); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - btPageMem = page->mutate(); - allocatedSize = page->size(); - pages.push_back({pageLowerBound, page}); - } - else { - ASSERT(blockCount > 1); - allocatedSize = usableBlockSize * blockCount; - btPageMem = new uint8_t[allocatedSize]; - VALGRIND_MAKE_MEM_DEFINED(btPageMem, allocatedSize); - } - - btPage->flags = newFlags; - btPage->kvBytes = kvBytes; - btPage->count = i - start; - btPage->extensionPageCount = blockCount - 1; - - int written = btPage->tree().build(&entries[start], &entries[i], &pageLowerBound, &pageUpperBound); - if(written > pageSize) { - fprintf(stderr, "ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", written, pageSize, blockCount, i - start, kvBytes, compressedBytes); - ASSERT(false); - } - - if(blockCount != 1) { - Reference page = newBlockFn(); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - - const uint8_t *rptr = btPageMem; - memcpy(page->mutate(), rptr, usableBlockSize); - rptr += usableBlockSize; - - std::vector> extPages; - for(int b = 1; b < blockCount; ++b) { - Reference extPage = newBlockFn(); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - - //debug_printf("block %d write offset %d\n", b, firstBlockSize + (b - 1) * usableBlockSize); - memcpy(extPage->mutate(), rptr, usableBlockSize); - rptr += usableBlockSize; - extPages.push_back(std::move(extPage)); - } - - pages.push_back({std::move(pageLowerBound), std::move(page), std::move(extPages)}); - delete btPageMem; - } - - if(end) - break; - start = i; - kvBytes = 0; - compressedBytes = BTreePage::BinaryTree::GetTreeOverhead(); - pageLowerBound = pageUpperBound.withoutValue(); - } + const T * begin() const { + return (T *)(this + 1); + } + + T * begin() { + return (T *)(this + 1); } - //debug_printf("buildPages: returning pages.size %lu, kvpairs %lu\n", pages.size(), kvPairs.size()); - return pages; -} + const T * end() const { + return begin() + count; + } + + T * end() { + return begin() + count; + } -#define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); } + VectorRef get() { + return VectorRef(begin(), count); + } + + void set(VectorRef v, int availableSpace) { + ASSERT(sizeof(T) * v.size() <= availableSpace); + count = v.size(); + memcpy(begin(), v.begin(), sizeof(T) * v.size()); + } + + int extraSize() const { + return count * sizeof(T); + } +}; +#pragma pack(pop) class VersionedBTree : public IVersionedStore { public: @@ -1027,9 +2614,70 @@ public: // A record which is greater than the 
last possible record in the tree static RedwoodRecordRef dbEnd; + struct LazyDeleteQueueEntry { + Version version; + Standalone pageID; + + bool operator< (const LazyDeleteQueueEntry &rhs) const { + return version < rhs.version; + } + + int readFromBytes(const uint8_t *src) { + version = *(Version *)src; + src += sizeof(Version); + int count = *src++; + pageID = BTreePageID((LogicalPageID *)src, count); + return bytesNeeded(); + } + + int bytesNeeded() const { + return sizeof(Version) + 1 + (pageID.size() * sizeof(LogicalPageID)); + } + + int writeToBytes(uint8_t *dst) const { + *(Version *)dst = version; + dst += sizeof(Version); + *dst++ = pageID.size(); + memcpy(dst, pageID.begin(), pageID.size() * sizeof(LogicalPageID)); + return bytesNeeded(); + } + + std::string toString() const { + return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); + } + }; + + typedef FIFOQueue LazyDeleteQueueT; + +#pragma pack(push, 1) + struct MetaKey { + static constexpr int FORMAT_VERSION = 2; + // This serves as the format version for the entire tree, individual pages will not be versioned + uint16_t formatVersion; + uint8_t height; + LazyDeleteQueueT::QueueState lazyDeleteQueue; + InPlaceArray root; + + KeyRef asKeyRef() const { + return KeyRef((uint8_t *)this, sizeof(MetaKey) + root.extraSize()); + } + + void fromKeyRef(KeyRef k) { + memcpy(this, k.begin(), k.size()); + ASSERT(formatVersion == FORMAT_VERSION); + } + + std::string toString() { + return format("{height=%d formatVersion=%d root=%s lazyDeleteQueue=%s}", (int)height, (int)formatVersion, ::toString(root.get()).c_str(), lazyDeleteQueue.toString().c_str()); + } + + }; +#pragma pack(pop) + struct Counts { Counts() { memset(this, 0, sizeof(Counts)); + startTime = g_network ? now() : 0; } void clear() { @@ -1038,6 +2686,8 @@ public: int64_t pageReads; int64_t extPageReads; + int64_t pagePreloads; + int64_t extPagePreloads; int64_t setBytes; int64_t pageWrites; int64_t extPageWrites; @@ -1048,13 +2698,22 @@ public: int64_t getRanges; int64_t commitToPage; int64_t commitToPageStart; + double startTime; std::string toString(bool clearAfter = false) { - std::string s = format("set=%" PRId64 " clear=%" PRId64 " get=%" PRId64 " getRange=%" PRId64 " commit=%" PRId64 " pageRead=%" PRId64 " extPageRead=%" PRId64 " pageWrite=%" PRId64 " extPageWrite=%" PRId64 " commitPage=%" PRId64 " commitPageStart=%" PRId64 "", - sets, clears, gets, getRanges, commits, pageReads, extPageReads, pageWrites, extPageWrites, commitToPage, commitToPageStart); + const char *labels[] = {"set", "clear", "get", "getRange", "commit", "pageReads", "extPageRead", "pagePreloads", "extPagePreloads", "pageWrite", "extPageWrite", "commitPage", "commitPageStart"}; + const int64_t values[] = {sets, clears, gets, getRanges, commits, pageReads, extPageReads, pagePreloads, extPagePreloads, pageWrites, extPageWrites, commitToPage, commitToPageStart}; + + double elapsed = now() - startTime; + std::string s; + for(int i = 0; i < sizeof(values) / sizeof(int64_t); ++i) { + s += format("%s=%" PRId64 " (%d/s) ", labels[i], values[i], int(values[i] / elapsed)); + } + if(clearAfter) { clear(); } + return s; } }; @@ -1064,16 +2723,16 @@ public: // All async opts on the btree are based on pager reads, writes, and commits, so // we can mostly forward these next few functions to the pager - virtual Future getError() { + Future getError() { return m_pager->getError(); } - virtual Future onClosed() { + Future onClosed() { return m_pager->onClosed(); } void close_impl(bool dispose) { - IPager 
*pager = m_pager; + auto *pager = m_pager; delete this; if(dispose) pager->dispose(); @@ -1081,24 +2740,24 @@ public: pager->close(); } - virtual void dispose() { + void dispose() { return close_impl(true); } - virtual void close() { + void close() { return close_impl(false); } - virtual KeyValueStoreType getType() NOT_IMPLEMENTED - virtual bool supportsMutation(int op) NOT_IMPLEMENTED - virtual StorageBytes getStorageBytes() { + KeyValueStoreType getType() NOT_IMPLEMENTED + bool supportsMutation(int op) NOT_IMPLEMENTED + StorageBytes getStorageBytes() { return m_pager->getStorageBytes(); } // Writes are provided in an ordered stream. // A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion() // A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns - virtual void set(KeyValueRef keyValue) { + void set(KeyValueRef keyValue) { ++counts.sets; SingleKeyMutationsByVersion &changes = insertMutationBoundary(keyValue.key)->second.startKeyMutations; @@ -1117,7 +2776,7 @@ public: } } } - virtual void clear(KeyRangeRef range) { + void clear(KeyRangeRef range) { ++counts.clears; MutationBufferT::iterator iBegin = insertMutationBoundary(range.begin); MutationBufferT::iterator iEnd = insertMutationBoundary(range.end); @@ -1149,12 +2808,17 @@ public: } } - virtual void mutate(int op, StringRef param1, StringRef param2) NOT_IMPLEMENTED + void mutate(int op, StringRef param1, StringRef param2) NOT_IMPLEMENTED - // Versions [begin, end) no longer readable - virtual void forgetVersions(Version begin, Version end) NOT_IMPLEMENTED + void setOldestVersion(Version v) { + m_newOldestVersion = v; + } - virtual Future getLatestVersion() { + Version getOldestVersion() { + return m_pager->getOldestVersion(); + } + + Version getLatestVersion() { if(m_writeVersion != invalidVersion) return m_writeVersion; return m_pager->getLatestVersion(); @@ -1168,37 +2832,122 @@ public: return m_lastCommittedVersion; } - VersionedBTree(IPager *pager, std::string name, bool singleVersion = false, int target_page_size = -1) + VersionedBTree(IPager2 *pager, std::string name, bool singleVersion = false) : m_pager(pager), m_writeVersion(invalidVersion), - m_usablePageSizeOverride(pager->getUsablePageSize()), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr), m_name(name), singleVersion(singleVersion) { - if(target_page_size > 0 && target_page_size < m_usablePageSizeOverride) - m_usablePageSizeOverride = target_page_size; m_init = init_impl(this); m_latestCommit = m_init; } + ACTOR static Future incrementalSubtreeClear(VersionedBTree *self, bool *pStop = nullptr, unsigned int minPages = 0, int maxPages = std::numeric_limits::max()) { + // TODO: Is it contractually okay to always to read at the latest version? 
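+		// Lazy delete works front-to-back through m_lazyDeleteQueue: each popped interior page
+		// has its height-1 children freed immediately and its deeper children re-queued, then
+		// the page itself is freed, until the queue is drained or the page budget is hit.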
+		state Reference<IPagerSnapshot> snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion());
+		state int freedPages = 0;
+		loop {
+			// Take a page from front of queue
+			state Optional<LazyDeleteQueueEntry> q = wait(self->m_lazyDeleteQueue.pop());
+			debug_printf("LazyDelete: popped %s\n", toString(q).c_str());
+			if(!q.present()) {
+				break;
+			}
+
+			// Read the page without caching
+			Reference<const IPage> p = wait(self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true));
+			const BTreePage &btPage = *(BTreePage *)p->begin();
+
+			// Level 1 (leaf) nodes should never be in the lazy delete queue
+			ASSERT(btPage.height > 1);
+
+			// Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses
+			// RedwoodRecordRef::DeltaValueOnly as the delta type to skip key decoding
+			BTreePage::ValueTree::Reader reader(&btPage.valueTree(), &dbBegin, &dbEnd);
+			auto c = reader.getCursor();
+			ASSERT(c.moveFirst());
+			Version v = q.get().version;
+			while(1) {
+				if(c.get().value.present()) {
+					BTreePageID btChildPageID = c.get().getChildPage();
+					// If this page is height 2, then the children are leaves so free
+					if(btPage.height == 2) {
+						debug_printf("LazyDelete: freeing child %s\n", toString(btChildPageID).c_str());
+						self->freeBtreePage(btChildPageID, v);
+						freedPages += btChildPageID.size();
+					}
+					else {
+						// Otherwise, queue them for lazy delete.
+						debug_printf("LazyDelete: queuing child %s\n", toString(btChildPageID).c_str());
+						self->m_lazyDeleteQueue.pushFront(LazyDeleteQueueEntry{v, btChildPageID});
+					}
+				}
+				if(!c.moveNext()) {
+					break;
+				}
+			}
+
+			// Free the page, now that its children have either been freed or queued
+			debug_printf("LazyDelete: freeing queue entry %s\n", toString(q.get().pageID).c_str());
+			self->freeBtreePage(q.get().pageID, v);
+			freedPages += q.get().pageID.size();
+
+			// If stop is set and we've freed the minimum number of pages required, or the maximum is exceeded, return.
+			if((freedPages >= minPages && pStop != nullptr && *pStop) || freedPages >= maxPages) {
+				break;
+			}
+		}
+
+		debug_printf("LazyDelete: freed %d pages, %s has %" PRId64 " entries\n", freedPages, self->m_lazyDeleteQueue.name.c_str(), self->m_lazyDeleteQueue.numEntries);
+		return freedPages;
+	}
+
 	ACTOR static Future<Void> init_impl(VersionedBTree *self) {
-		self->m_root = 0;
-		state Version latest = wait(self->m_pager->getLatestVersion());
-		if(latest == 0) {
+		wait(self->m_pager->init());
+
+		state Version latest = self->m_pager->getLatestVersion();
+		self->m_newOldestVersion = self->m_pager->getOldestVersion();
+
+		debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n", latest, self->m_newOldestVersion);
+
+		state Key meta = self->m_pager->getMetaKey();
+		if(meta.size() == 0) {
+			self->m_header.formatVersion = MetaKey::FORMAT_VERSION;
+			LogicalPageID id = wait(self->m_pager->newPageID());
+			BTreePageID newRoot((LogicalPageID *)&id, 1);
+			debug_printf("new root %s\n", toString(newRoot).c_str());
+			self->m_header.root.set(newRoot, sizeof(headerSpace) - sizeof(m_header));
+			self->m_header.height = 1;
 			++latest;
 			Reference<IPage> page = self->m_pager->newPageBuffer();
-			makeEmptyPage(page, BTreePage::IS_LEAF, self->m_usablePageSizeOverride);
-			self->writePage(self->m_root, page, latest, &dbBegin, &dbEnd);
-			self->m_pager->setLatestVersion(latest);
+			makeEmptyRoot(page);
+			self->m_pager->updatePage(id, page);
+			self->m_pager->setCommitVersion(latest);
+
+			LogicalPageID newQueuePage = wait(self->m_pager->newPageID());
+			self->m_lazyDeleteQueue.create(self->m_pager, newQueuePage, "LazyDeleteQueue");
+			self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState();
+			self->m_pager->setMetaKey(self->m_header.asKeyRef());
 			wait(self->m_pager->commit());
+			debug_printf("Committed initial commit.\n");
 		}
+		else {
+			self->m_header.fromKeyRef(meta);
+			self->m_lazyDeleteQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered");
+		}
+
+		debug_printf("Recovered btree at version %" PRId64 ": %s\n", latest, self->m_header.toString().c_str());
+
+		self->m_maxPartSize = std::min(255, self->m_pager->getUsablePageSize() / 5);
 		self->m_lastCommittedVersion = latest;
 		return Void();
 	}
 
-	Future<Void> init() { return m_init; }
+	Future<Void> init() override {
+		return m_init;
+	}
 
 	virtual ~VersionedBTree() {
 		// This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe,
@@ -1208,35 +2957,36 @@ public:
 		m_latestCommit.cancel();
 	}
 
-	// readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed
-	// to forgetVersion.  The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations.
-	// The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less
-	// than or equal to the given version.
-	// If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes at the same
-	// write version, OR it may represent a snapshot as of the call to readAtVersion().
-	virtual Reference<IStoreCursor> readAtVersion(Version v) {
-		// TODO: Use the buffer to return uncommitted data
-		// For now, only committed versions can be read.
+	Reference<IStoreCursor> readAtVersion(Version v) {
+		// Only committed versions can be read.
 		Version recordVersion = singleVersion ?
0 : v; ASSERT(v <= m_lastCommittedVersion); if(singleVersion) { ASSERT(v == m_lastCommittedVersion); } - return Reference(new Cursor(m_pager->getReadSnapshot(v), m_root, recordVersion, m_usablePageSizeOverride)); + Reference snapshot = m_pager->getReadSnapshot(v); + + // Snapshot will continue to hold the metakey value memory + KeyRef m = snapshot->getMetaKey(); + + return Reference(new Cursor(snapshot, ((MetaKey *)m.begin())->root.get(), recordVersion)); } // Must be nondecreasing - virtual void setWriteVersion(Version v) { + void setWriteVersion(Version v) { ASSERT(v > m_lastCommittedVersion); // If there was no current mutation buffer, create one in the buffer map and update m_pBuffer if(m_pBuffer == nullptr) { // When starting a new mutation buffer its start version must be greater than the last write version ASSERT(v > m_writeVersion); m_pBuffer = &m_mutationBuffers[v]; + // Create range representing the entire keyspace. This reduces edge cases to applying mutations // because now all existing keys are within some range in the mutation map. - (*m_pBuffer)[dbBegin.key]; - (*m_pBuffer)[dbEnd.key]; + (*m_pBuffer)[dbBegin.key] = RangeMutation(); + // Setting the dbEnd key to be cleared prevents having to treat a range clear to dbEnd as a special + // case in order to avoid traversing down the rightmost edge of the tree. + (*m_pBuffer)[dbEnd.key].startKeyMutations[0] = SingleKeyMutation(); } else { // It's OK to set the write version to the same version repeatedly so long as m_pBuffer is not null @@ -1245,32 +2995,85 @@ public: m_writeVersion = v; } - virtual Future commit() { + Future commit() { if(m_pBuffer == nullptr) return m_latestCommit; return commit_impl(this); } + ACTOR static Future destroyAndCheckSanity_impl(VersionedBTree *self) { + ASSERT(g_network->isSimulated()); + + debug_printf("Clearing tree.\n"); + self->setWriteVersion(self->getLatestVersion() + 1); + self->clear(KeyRangeRef(dbBegin.key, dbEnd.key)); + + loop { + state int freedPages = wait(self->incrementalSubtreeClear(self)); + wait(self->commit()); + // Keep looping until the last commit doesn't do anything at all + if(self->m_lazyDeleteQueue.numEntries == 0 && freedPages == 0) { + break; + } + self->setWriteVersion(self->getLatestVersion() + 1); + } + + // Forget all but the latest version of the tree. + debug_printf("Discarding all old versions.\n"); + self->setOldestVersion(self->getLastCommittedVersion()); + self->setWriteVersion(self->getLatestVersion() + 1); + wait(self->commit()); + + // The lazy delete queue should now be empty and contain only the new page to start writing to + // on the next commit. + LazyDeleteQueueT::QueueState s = self->m_lazyDeleteQueue.getState(); + ASSERT(s.numEntries == 0); + ASSERT(s.numPages == 1); + + // The btree should now be a single non-oversized root page. + ASSERT(self->m_header.height == 1); + ASSERT(self->m_header.root.count == 1); + + // From the pager's perspective the only pages that should be in use are the btree root and + // the previously mentioned lazy delete queue page. 
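+		// getUserPageCount() already excludes the pager's own overhead (the two header pages
+		// and the pages backing the free lists and remap queue), so exactly the btree root
+		// page and the one lazy delete queue page should remain.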
+ int64_t userPageCount = wait(self->m_pager->getUserPageCount()); + ASSERT(userPageCount == 2); + + return Void(); + } + + Future destroyAndCheckSanity() { + return destroyAndCheckSanity_impl(this); + } + bool isSingleVersion() const { return singleVersion; } private: - void writePage(LogicalPageID id, Reference page, Version ver, const RedwoodRecordRef *pageLowerBound, const RedwoodRecordRef *pageUpperBound) { - debug_printf("writePage(): %s\n", ((const BTreePage *)page->begin())->toString(true, id, ver, pageLowerBound, pageUpperBound).c_str()); - m_pager->writePage(id, page, ver); - } + struct VersionAndChildrenRef { + VersionAndChildrenRef(Version v, VectorRef children, RedwoodRecordRef upperBound) + : version(v), children(children), upperBound(upperBound) { + } - LogicalPageID m_root; + VersionAndChildrenRef(Arena &arena, const VersionAndChildrenRef &toCopy) + : version(toCopy.version), children(arena, toCopy.children), upperBound(arena, toCopy.upperBound) { + } + + int expectedSize() const { + return children.expectedSize() + upperBound.expectedSize(); + } + + std::string toString() const { + return format("{version=%" PRId64 " children=%s upperbound=%s}", version, ::toString(children).c_str(), upperBound.toString().c_str()); + } - // TODO: Don't use Standalone - struct VersionedChildPageSet { Version version; - std::vector> children; - Standalone upperBound; + VectorRef children; + RedwoodRecordRef upperBound; }; - typedef std::vector VersionedChildrenT; + typedef VectorRef VersionedChildrenT; // Utility class for building a vector of internal page entries. // Entries must be added in version order. Modified will be set to true @@ -1284,6 +3087,8 @@ private: { } + private: + // This must be called internally, on records whose arena has already been added to the entries arena inline void addEntry(const RedwoodRecordRef &rec) { if(rec.value.present()) { ++childPageCount; @@ -1307,10 +3112,11 @@ private: } } - entries.push_back(rec); + entries.push_back(entries.arena(), rec); } - - void addEntries(const VersionedChildPageSet &newSet) { + public: + // Add the child entries from newSet into entries + void addEntries(VersionAndChildrenRef newSet) { // If there are already entries, the last one links to a child page, and its upper bound is not the same // as the first lowerBound in newSet (or newSet is empty, as the next newSet is necessarily greater) // then add the upper bound of the previous set as a value-less record so that on future reads @@ -1368,32 +3174,12 @@ private: } BTreePage::BinaryTree::Cursor cursor; - std::vector> entries; - Standalone lastUpperBound; + Standalone> entries; + RedwoodRecordRef lastUpperBound; bool modified; int childPageCount; - Arena arena; }; - - template - static std::string toString(const T &o) { - return o.toString(); - } - - static std::string toString(const VersionedChildPageSet &c) { - return format("Version=%" PRId64 " children=%s upperBound=%s", c.version, toString(c.children).c_str(), c.upperBound.toString().c_str()); - } - - template - static std::string toString(const std::vector &v) { - std::string r = "{ "; - for(auto &o : v) { - r += toString(o) + ", "; - } - return r + " }"; - } - // Represents a change to a single key - set, clear, or atomic op struct SingleKeyMutation { // Clear @@ -1436,6 +3222,18 @@ private: // A clear range version, if cleared, for the range starting immediately AFTER the start key Optional rangeClearVersion; + bool keyCleared() const { + return startKeyMutations.size() == 1 && 
startKeyMutations.begin()->second.isClear(); + } + + bool keyChanged() const { + return !startKeyMutations.empty(); + } + + bool rangeCleared() const { + return rangeClearVersion.present(); + } + // Returns true if this RangeMutation doesn't actually mutate anything bool noChanges() const { return !rangeClearVersion.present() && startKeyMutations.empty(); @@ -1506,33 +3304,26 @@ private: * to be sorted later just before being merged into the existing leaf page. */ - IPager *m_pager; + IPager2 *m_pager; MutationBufferT *m_pBuffer; std::map m_mutationBuffers; Version m_writeVersion; Version m_lastCommittedVersion; + Version m_newOldestVersion; Future m_latestCommit; - int m_usablePageSizeOverride; Future m_init; std::string m_name; bool singleVersion; - void printMutationBuffer(MutationBufferT::const_iterator begin, MutationBufferT::const_iterator end) const { -#if REDWOOD_DEBUG - debug_printf("-------------------------------------\n"); - debug_printf("BUFFER\n"); - while(begin != end) { - debug_printf("'%s': %s\n", printable(begin->first).c_str(), begin->second.toString().c_str()); - ++begin; - } - debug_printf("-------------------------------------\n"); -#endif - } + // MetaKey changes size so allocate space for it to expand into + union { + uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 20]; + MetaKey m_header; + }; - void printMutationBuffer(MutationBufferT *buf) const { - return printMutationBuffer(buf->begin(), buf->end()); - } + LazyDeleteQueueT m_lazyDeleteQueue; + int m_maxPartSize; // Find or create a mutation buffer boundary for bound and return an iterator to it MutationBufferT::iterator insertMutationBoundary(Key boundary) { @@ -1565,178 +3356,339 @@ private: return ib; } - void buildNewRoot(Version version, std::vector &pages, std::vector &logicalPageIDs, const BTreePage *pPage) { - //debug_printf("buildNewRoot start %lu\n", pages.size()); - // While there are multiple child pages for this version we must write new tree levels. 
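// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: a standalone model of the RangeMutation
// predicates added above. std::map<Version, bool> stands in for the real
// SingleKeyMutationsByVersion (version -> SingleKeyMutation); the bool means "is a clear".
// ----------------------------------------------------------------------------------------
#include <cassert>
#include <map>
#include <optional>
typedef long long Version;
struct RangeMutationModel {
	std::map<Version, bool> startKeyMutations; // version -> isClear
	std::optional<Version> rangeClearVersion;
	// Exactly one mutation on the start key, and it is a clear
	bool keyCleared() const { return startKeyMutations.size() == 1 && startKeyMutations.begin()->second; }
	// The start key was set or cleared at least once
	bool keyChanged() const { return !startKeyMutations.empty(); }
	// The range starting immediately after the start key was cleared
	bool rangeCleared() const { return rangeClearVersion.has_value(); }
	bool noChanges() const { return !rangeClearVersion.has_value() && startKeyMutations.empty(); }
};
int main() {
	RangeMutationModel m;
	assert(m.noChanges());
	m.startKeyMutations[5] = true; // a single clear of the boundary key at version 5
	assert(m.keyChanged() && m.keyCleared() && !m.rangeCleared());
	return 0;
}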
- while(pages.size() > 1) { - std::vector childEntries; - for(int i=0; i>> writePages(VersionedBTree *self, bool minimalBoundaries, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, VectorRef entries, int height, Version v, BTreePageID previousID) { + ASSERT(entries.size() > 0); + state Standalone> records; - pages = buildPages(false, dbBegin, dbEnd, childEntries, 0, [=](){ return m_pager->newPageBuffer(); }, m_usablePageSizeOverride); + // This is how much space for the binary tree exists in the page, after the header + state int blockSize = self->m_pager->getUsablePageSize(); + state int pageSize = blockSize - sizeof(BTreePage); + state int blockCount = 1; - debug_printf("Writing a new root level at version %" PRId64 " with %lu children across %lu pages\n", version, childEntries.size(), pages.size()); + state int kvBytes = 0; + state int compressedBytes = BTreePage::BinaryTree::GetTreeOverhead(); - logicalPageIDs = writePages(pages, version, m_root, pPage, &dbEnd, nullptr); - } - } + state int start = 0; + state int i = 0; + state bool end; - std::vector writePages(std::vector pages, Version version, LogicalPageID originalID, const BTreePage *originalPage, const RedwoodRecordRef *upperBound, void *actor_debug) { - debug_printf("%p: writePages(): %u @%" PRId64 " -> %lu replacement pages\n", actor_debug, originalID, version, pages.size()); + // For leaf level where minimal boundaries are used require at least 1 entry, otherwise require 4 to enforce a minimum branching factor + state int minimumEntries = minimalBoundaries ? 1 : 4; + + // Lower bound of the page being added to + state RedwoodRecordRef pageLowerBound = lowerBound->withoutValue(); + state RedwoodRecordRef pageUpperBound; - ASSERT(version != 0 || pages.size() == 1); + while(i <= entries.size()) { + end = i == entries.size(); + bool flush = end; - std::vector primaryLogicalPageIDs; - - // Reuse original primary page ID if it's not the root or if only one page is being written. - if(originalID != m_root || pages.size() == 1) - primaryLogicalPageIDs.push_back(originalID); - - // Allocate a primary page ID for each page to be written - while(primaryLogicalPageIDs.size() < pages.size()) { - primaryLogicalPageIDs.push_back(m_pager->allocateLogicalPage()); - } - - debug_printf("%p: writePages(): Writing %lu replacement pages for %d at version %" PRId64 "\n", actor_debug, pages.size(), originalID, version); - for(int i=0; iwritePage() is for whole primary pages - if(extPages.size() != 0) { - BTreePage *newPage = (BTreePage *)pages[i].firstPage->mutate(); - ASSERT(newPage->extensionPageCount == extPages.size()); - - for(int e = 0, eEnd = extPages.size(); e < eEnd; ++e) { - LogicalPageID eid = m_pager->allocateLogicalPage(); - debug_printf("%p: writePages(): Writing extension page op=write id=%u @%" PRId64 " (%d of %lu) referencePageID=%u\n", actor_debug, eid, version, e + 1, extPages.size(), id); - newPage->extensionPages()[e] = bigEndian32(eid); - // If replacing the primary page below (version == 0) then pass the primary page's ID as the reference page ID - m_pager->writePage(eid, extPages[e], version, (version == 0) ? 
id : invalidLogicalPageID); - ++counts.extPageWrites; - } - - debug_printf("%p: writePages(): Writing primary page op=write id=%u @%" PRId64 " (+%lu extension pages)\n", actor_debug, id, version, extPages.size()); - m_pager->writePage(id, pages[i].firstPage, version); + // If not the end, add i to the page if necessary + if(end) { + pageUpperBound = upperBound->withoutValue(); } else { - debug_printf("%p: writePages(): Writing normal page op=write id=%u @%" PRId64 "\n", actor_debug, id, version); - writePage(id, pages[i].firstPage, version, &pages[i].lowerBound, (i == pages.size() - 1) ? upperBound : &pages[i + 1].lowerBound); + // Get delta from previous record + const RedwoodRecordRef &entry = entries[i]; + int deltaSize = entry.deltaSize((i == start) ? pageLowerBound : entries[i - 1]); + int keySize = entry.key.size(); + int valueSize = entry.value.present() ? entry.value.get().size() : 0; + + int spaceNeeded = sizeof(BTreePage::BinaryTree::Node) + deltaSize; + + debug_printf("Trying to add record %3d of %3lu (i=%3d) klen %4d vlen %3d deltaSize %4d spaceNeeded %4d compressed %4d / page %4d bytes %s\n", + i + 1, entries.size(), i, keySize, valueSize, deltaSize, + spaceNeeded, compressedBytes, pageSize, entry.toString().c_str()); + + int spaceAvailable = pageSize - compressedBytes; + + // Does it fit? + bool fits = spaceAvailable >= spaceNeeded; + + // If it doesn't fit, either end the current page or increase the page size + if(!fits) { + int count = i - start; + + // If not enough entries or page less than half full, increase page size to make the entry fit + if(count < minimumEntries || spaceAvailable > pageSize / 2) { + // Figure out how many additional whole or partial blocks are needed + // newBlocks = ceil ( additional space needed / block size) + int newBlocks = 1 + (spaceNeeded - spaceAvailable - 1) / blockSize; + int newPageSize = pageSize + (newBlocks * blockSize); + if(newPageSize <= BTreePage::BinaryTree::MaximumTreeSize()) { + blockCount += newBlocks; + pageSize = newPageSize; + fits = true; + } + } + if(!fits) { + pageUpperBound = entry.withoutValue(); + } + } + + // If the record fits then add it to the page set + if(fits) { + kvBytes += keySize + valueSize; + compressedBytes += spaceNeeded; + ++i; + } + + flush = !fits; + } + + // If flush then write a page using records from start to i. It's guaranteed that pageUpperBound has been set above. + if(flush) { + int remaining = entries.size() - i; + end = remaining == 0; // i could have been moved above + int count = i - start; + + // If + // - this is not the last page + // - the number of entries remaining after this page is less than the count of the current page + // - the page that would be written ends on a user key boundary + // Then adjust the current page item count to half the amount remaining after the start position. 
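// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: when an entry does not fit, writePages()
// above grows the page by whole pager blocks using integer ceiling division:
//   newBlocks = ceil((spaceNeeded - spaceAvailable) / blockSize)
//             = 1 + (spaceNeeded - spaceAvailable - 1) / blockSize
// blocksNeeded() is a hypothetical standalone version of that arithmetic.
// ----------------------------------------------------------------------------------------
#include <cassert>
int blocksNeeded(int spaceNeeded, int spaceAvailable, int blockSize) {
	return 1 + (spaceNeeded - spaceAvailable - 1) / blockSize;
}
int main() {
	assert(blocksNeeded(100, 99, 4096) == 1);  // 1 byte short -> one more block
	assert(blocksNeeded(5000, 0, 4096) == 2);  // 5000 extra bytes -> two 4KiB blocks
	assert(blocksNeeded(4097, 1, 4096) == 1);  // exactly one block's worth
	return 0;
}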
+ if(!end && remaining < count && entries[i - 1].key != entries[i].key) { + i = (start + entries.size()) / 2; + pageUpperBound = entries[i].withoutValue(); + } + + // If this isn't the final page, shorten the upper boundary + if(!end && minimalBoundaries) { + int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[i - 1], 0); + pageUpperBound.truncate(commonPrefix + 1); + } + + state std::vector> pages; + BTreePage *btPage; + + if(blockCount == 1) { + Reference page = self->m_pager->newPageBuffer(); + btPage = (BTreePage *)page->mutate(); + pages.push_back(std::move(page)); + } + else { + ASSERT(blockCount > 1); + int size = blockSize * blockCount; + btPage = (BTreePage *)new uint8_t[size]; + } + + btPage->height = height; + btPage->kvBytes = kvBytes; + btPage->itemCount = i - start; + + int written = btPage->tree().build(&entries[start], &entries[i], &pageLowerBound, &pageUpperBound); + if(written > pageSize) { + fprintf(stderr, "ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", written, pageSize, blockCount, i - start, kvBytes, compressedBytes); + ASSERT(false); + } + + // Create chunked pages + // TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled. + if(blockCount != 1) { + // Mark the slack in the page buffer as defined + VALGRIND_MAKE_MEM_DEFINED(((uint8_t *)btPage) + written, (blockCount * blockSize) - written); + const uint8_t *rptr = (const uint8_t *)btPage; + for(int b = 0; b < blockCount; ++b) { + Reference page = self->m_pager->newPageBuffer(); + memcpy(page->mutate(), rptr, blockSize); + rptr += blockSize; + pages.push_back(std::move(page)); + } + delete [] (uint8_t *)btPage; + } + + // Write this btree page, which is made of 1 or more pager pages. + state int p; + state BTreePageID childPageID; + + // If we are only writing 1 page and it has the same BTreePageID size as the original then try to reuse the + // LogicalPageIDs in previousID and try to update them atomically. + if(end && records.empty() && previousID.size() == pages.size()) { + for(p = 0; p < pages.size(); ++p) { + LogicalPageID id = wait(self->m_pager->atomicUpdatePage(previousID[p], pages[p], v)); + childPageID.push_back(records.arena(), id); + } + } + else { + // Either the original page is being split, or it's not but it has changed BTreePageID size. + // Either way, there is no point in reusing any of the original page IDs because the parent + // must be rewritten anyway to account for the change in child count or child links. + // Free the old IDs, but only once (before the first output record is added).
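// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: for leaf levels the code above shortens a
// page's upper boundary to one byte past its common prefix with the page's last entry
// (pageUpperBound.truncate(commonPrefix + 1)), keeping parent keys short while still
// sorting between the two pages. shortenBoundary() is a hypothetical standalone version.
// ----------------------------------------------------------------------------------------
#include <cassert>
#include <string>
std::string shortenBoundary(const std::string &lastEntryKey, std::string upperBound) {
	size_t common = 0;
	while(common < lastEntryKey.size() && common < upperBound.size()
	      && lastEntryKey[common] == upperBound[common]) {
		++common;
	}
	upperBound.resize(common + 1); // analogous to truncate(commonPrefix + 1)
	return upperBound;
}
int main() {
	// "apple" and "apricot" share the prefix "ap"; "apr" still sorts between them.
	std::string b = shortenBoundary("apple", "apricot");
	assert(b == "apr");
	assert(std::string("apple") < b && b < std::string("apricot"));
	return 0;
}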
+ if(records.empty()) { + self->freeBtreePage(previousID, v); + } + for(p = 0; p < pages.size(); ++p) { + LogicalPageID id = wait(self->m_pager->newPageID()); + self->m_pager->updatePage(id, pages[p]); + childPageID.push_back(records.arena(), id); + } + } + wait(yield()); + + // Update activity counts + ++counts.pageWrites; + if(pages.size() > 1) { + counts.extPageWrites += pages.size() - 1; + } + + debug_printf("Flushing %s original=%s start=%d i=%d count=%d\nlower: %s\nupper: %s\n", toString(childPageID).c_str(), toString(previousID).c_str(), start, i, i - start, pageLowerBound.toString().c_str(), pageUpperBound.toString().c_str()); + if(REDWOOD_DEBUG) { + for(int j = start; j < i; ++j) { + debug_printf(" %3d: %s\n", j, entries[j].toString().c_str()); + } + ASSERT(pageLowerBound.key <= pageUpperBound.key); + } + + // Push a new record onto the results set, without the child page, copying it into the records arena + records.push_back_deep(records.arena(), pageLowerBound.withoutValue()); + // Set the child page value of the inserted record to childPageID, which has already been allocated in records.arena() above + records.back().setChildPage(childPageID); + + if(end) { + break; + } + + start = i; + kvBytes = 0; + compressedBytes = BTreePage::BinaryTree::GetTreeOverhead(); + pageLowerBound = pageUpperBound.withoutValue(); } } - // Free the old extension pages now that all replacement pages have been written - for(int i = 0; i < originalPage->extensionPageCount; ++i) { - //debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, bigEndian32(originalPage->extensionPages()[i])); - //m_pager->freeLogicalPage(bigEndian32(originalPage->extensionPages()[i]), version); - } - - return primaryLogicalPageIDs; + return records; } - class SuperPage : public IPage, ReferenceCounted { + ACTOR static Future>> buildNewRoot(VersionedBTree *self, Version version, Standalone> records, int height) { + debug_printf("buildNewRoot start version %" PRId64 ", %lu records\n", version, records.size()); + + // While there are multiple child pages for this version we must write new tree levels. 
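// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the branch above only attempts in-place
// (atomic) page updates when this is the final flush, no output records exist yet, and the
// new BTreePageID is the same width as the one being replaced; otherwise the old IDs are
// freed once and fresh ones allocated. canAtomicallyUpdate() is a hypothetical predicate
// restating that rule.
// ----------------------------------------------------------------------------------------
#include <cassert>
#include <cstddef>
bool canAtomicallyUpdate(bool end, size_t recordsSoFar, size_t previousIDCount, size_t newPageCount) {
	return end && recordsSoFar == 0 && previousIDCount == newPageCount;
}
int main() {
	assert(canAtomicallyUpdate(true, 0, 1, 1));   // single unsplit page: update in place
	assert(!canAtomicallyUpdate(true, 0, 1, 2));  // page split: parent changes, new IDs
	assert(!canAtomicallyUpdate(false, 0, 1, 1)); // more pages coming: parent changes anyway
	return 0;
}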
+ while(records.size() > 1) { + self->m_header.height = ++height; + Standalone> newRecords = wait(writePages(self, false, &dbBegin, &dbEnd, records, height, version, BTreePageID())); + debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height, newRecords.size()); + records = newRecords; + } + + return records; + } + + class SuperPage : public IPage, ReferenceCounted, public FastAllocated{ public: - SuperPage(std::vector> pages, int usablePageSize) - : m_size(pages.size() * usablePageSize) { + SuperPage(std::vector> pages) { + int blockSize = pages.front()->size(); + m_size = blockSize * pages.size(); m_data = new uint8_t[m_size]; uint8_t *wptr = m_data; for(auto &p : pages) { - memcpy(wptr, p->begin(), usablePageSize); - wptr += usablePageSize; + ASSERT(p->size() == blockSize); + memcpy(wptr, p->begin(), blockSize); + wptr += blockSize; } } virtual ~SuperPage() { - delete m_data; + delete [] m_data; } - virtual void addref() const { + void addref() const { ReferenceCounted::addref(); } - virtual void delref() const { + void delref() const { ReferenceCounted::delref(); } - virtual int size() const { + int size() const { return m_size; } - virtual uint8_t const* begin() const { + uint8_t const* begin() const { return m_data; } - virtual uint8_t* mutate() { + uint8_t* mutate() { return m_data; } private: uint8_t *m_data; - const int m_size; + int m_size; }; - ACTOR static Future> readPage(Reference snapshot, LogicalPageID id, int usablePageSize, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) { - debug_printf("readPage() op=read id=%u @%" PRId64 " lower=%s upper=%s\n", id, snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); - wait(delay(0, TaskPriority::DiskRead)); - - state Reference result = wait(snapshot->getPhysicalPage(id)); - ++counts.pageReads; - state const BTreePage *pTreePage = (const BTreePage *)result->begin(); - - if(pTreePage->extensionPageCount == 0) { - debug_printf("readPage() Found normal page for op=read id=%u @%" PRId64 "\n", id, snapshot->getVersion()); + ACTOR static Future> readPage(Reference snapshot, BTreePageID id, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, bool forLazyDelete = false) { + if(!forLazyDelete) { + debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); } else { - std::vector>> pageGets; - pageGets.push_back(std::move(result)); + debug_printf("readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); + } - for(int i = 0; i < pTreePage->extensionPageCount; ++i) { - debug_printf("readPage() Reading extension page op=read id=%u @%" PRId64 " ext=%d/%d\n", bigEndian32(pTreePage->extensionPages()[i]), snapshot->getVersion(), i + 1, (int)pTreePage->extensionPageCount); - pageGets.push_back(snapshot->getPhysicalPage(bigEndian32(pTreePage->extensionPages()[i]))); + wait(yield()); + + state Reference page; + + ++counts.pageReads; + if(id.size() == 1) { + Reference p = wait(snapshot->getPhysicalPage(id.front(), !forLazyDelete, false)); + page = p; + } + else { + ASSERT(!id.empty()); + counts.extPageReads += (id.size() - 1); + std::vector>> reads; + for(auto &pageID : id) { + reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyDelete, false)); } - - std::vector> pages = wait(getAll(pageGets)); - counts.extPageReads += pTreePage->extensionPageCount; - result 
= Reference(new SuperPage(pages, usablePageSize)); - pTreePage = (const BTreePage *)result->begin(); + std::vector> pages = wait(getAll(reads)); + // TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager. + page = Reference(new SuperPage(pages)); } - if(result->userData == nullptr) { - debug_printf("readPage() Creating Reader for PageID=%u @%" PRId64 " lower=%s upper=%s\n", id, snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); - result->userData = new BTreePage::BinaryTree::Reader(&pTreePage->tree(), lowerBound, upperBound); - result->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Reader *)ptr; }; + debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); + const BTreePage *pTreePage = (const BTreePage *)page->begin(); + + if(!forLazyDelete && page->userData == nullptr) { + debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); + page->userData = new BTreePage::BinaryTree::Reader(&pTreePage->tree(), lowerBound, upperBound); + page->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Reader *)ptr; }; } - debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); + if(!forLazyDelete) { + debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); + } - // Nothing should attempt to read bytes in the page outside the BTreePage structure - VALGRIND_MAKE_MEM_UNDEFINED(result->begin() + pTreePage->size(), result->size() - pTreePage->size()); - - return result; + return page; } - // Returns list of (version, list of (lower_bound, list of children) ) - // TODO: Probably should pass prev/next records by pointer in many places - ACTOR static Future commitSubtree(VersionedBTree *self, MutationBufferT *mutationBuffer, Reference snapshot, LogicalPageID root, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, const RedwoodRecordRef *decodeLowerBound, const RedwoodRecordRef *decodeUpperBound) { + static void preLoadPage(IPagerSnapshot *snapshot, BTreePageID id) { + ++counts.pagePreloads; + counts.extPagePreloads += (id.size() - 1); + + for(auto pageID : id) { + snapshot->getPhysicalPage(pageID, true, true); + } + } + + void freeBtreePage(BTreePageID btPageID, Version v) { + // Free individual pages at v + for(LogicalPageID id : btPageID) { + m_pager->freePage(id, v); + } + } + + // Returns list of (version, internal page records, required upper bound) + ACTOR static Future> commitSubtree(VersionedBTree *self, MutationBufferT *mutationBuffer, Reference snapshot, BTreePageID rootID, bool isLeaf, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, const RedwoodRecordRef *decodeLowerBound, const RedwoodRecordRef *decodeUpperBound) { state std::string context; if(REDWOOD_DEBUG) { - context = format("CommitSubtree(root=%u): ", root); + context = format("CommitSubtree(root=%s): ", toString(rootID).c_str()); } - debug_printf("%s root=%d lower=%s upper=%s\n", context.c_str(), root, lowerBound->toString().c_str(), upperBound->toString().c_str()); - debug_printf("%s root=%d decodeLower=%s decodeUpper=%s\n", context.c_str(), root, decodeLowerBound->toString().c_str(), decodeUpperBound->toString().c_str()); - self->counts.commitToPageStart++; + state Standalone results; 
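// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: readPage() above reassembles a multi-block
// btree page by concatenating equally sized pager pages, as SuperPage does. A minimal
// standalone model of that concatenation:
// ----------------------------------------------------------------------------------------
#include <cassert>
#include <vector>
std::vector<unsigned char> reassemble(const std::vector<std::vector<unsigned char>> &blocks) {
	size_t blockSize = blocks.front().size();
	std::vector<unsigned char> out;
	out.reserve(blockSize * blocks.size());
	for(const auto &b : blocks) {
		assert(b.size() == blockSize); // mirrors SuperPage's ASSERT(p->size() == blockSize)
		out.insert(out.end(), b.begin(), b.end());
	}
	return out;
}
int main() {
	std::vector<std::vector<unsigned char>> blocks = {{1, 2}, {3, 4}, {5, 6}};
	assert(reassemble(blocks) == (std::vector<unsigned char>{1, 2, 3, 4, 5, 6}));
	return 0;
}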
- // If a boundary changed, the page must be rewritten regardless of KV mutations - state bool boundaryChanged = (lowerBound != decodeLowerBound) || (upperBound != decodeUpperBound); - debug_printf("%s id=%u boundaryChanged=%d\n", context.c_str(), root, boundaryChanged); + debug_printf("%s lower=%s upper=%s\n", context.c_str(), lowerBound->toString().c_str(), upperBound->toString().c_str()); + debug_printf("%s decodeLower=%s decodeUpper=%s\n", context.c_str(), decodeLowerBound->toString().c_str(), decodeUpperBound->toString().c_str()); self->counts.commitToPageStart++; // Find the slice of the mutation buffer that is relevant to this subtree // TODO: Rather than two lower_bound searches, perhaps just compare each mutation to the upperBound key while iterating @@ -1745,94 +3697,162 @@ private: state MutationBufferT::const_iterator iMutationBoundaryEnd = mutationBuffer->lower_bound(upperBound->key); if(REDWOOD_DEBUG) { - self->printMutationBuffer(iMutationBoundary, iMutationBoundaryEnd); + debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str()); + auto begin = iMutationBoundary; + while(1) { + debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin->first).c_str(), begin->second.toString().c_str()); + if(begin == iMutationBoundaryEnd) { + break; + } + ++begin; + } + debug_printf("%s -------------------------------------\n", context.c_str()); } - // If the boundary range iterators are the same then upperbound and lowerbound have the same key. - // If the key is being mutated, them remove this subtree. + // iMutationBoundary is greatest boundary <= lowerBound->key + // iMutationBoundaryEnd is least boundary >= upperBound->key + + // If the boundary range iterators are the same then this subtree only has one unique key, which is the same key as the boundary + // record the iterators are pointing to. There are only two possible outcomes: clearing the subtree or leaving it alone. + // If there are any changes to the one key then the entire subtree should be deleted as the changes for the key + // do not go into this subtree. if(iMutationBoundary == iMutationBoundaryEnd) { - if(!iMutationBoundary->second.startKeyMutations.empty()) { - VersionedChildrenT c; - debug_printf("%s id=%u lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), root, toString(c).c_str()); - return c; + if(iMutationBoundary->second.keyChanged()) { + debug_printf("%s lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), toString(results).c_str()); + Version firstKeyChangeVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : iMutationBoundary->second.startKeyMutations.begin()->first; + if(isLeaf) { + self->freeBtreePage(rootID, firstKeyChangeVersion); + } + else { + self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{firstKeyChangeVersion, rootID}); + } + return results; } - // If there are no forced boundary changes then this subtree is unchanged.
- if(!boundaryChanged) { - VersionedChildrenT c({ {0, {*decodeLowerBound}, *decodeUpperBound} }); - debug_printf("%s id=%d page contains a single key '%s' which is not changing, returning %s\n", context.c_str(), root, lowerBound->key.toString().c_str(), toString(c).c_str()); - return c; - } + // Otherwise, no changes to this subtree + results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s page contains a single key '%s' which is not changing, returning %s\n", context.c_str(), lowerBound->key.toString().c_str(), toString(results).c_str()); + return results; } - // Another way to have no mutations is to have a single mutation range cover this - // subtree but have no changes in it + // If one mutation range covers the entire subtree, then check if the entire subtree is modified, + // unmodified, or possibly/partially modified. MutationBufferT::const_iterator iMutationBoundaryNext = iMutationBoundary; ++iMutationBoundaryNext; - if(!boundaryChanged && iMutationBoundaryNext == iMutationBoundaryEnd && - ( iMutationBoundary->second.noChanges() || - ( !iMutationBoundary->second.rangeClearVersion.present() && - iMutationBoundary->first < lowerBound->key) - ) - ) { - VersionedChildrenT c({ {0, {*decodeLowerBound}, *decodeUpperBound} }); - debug_printf("%s no changes because sole mutation range was not cleared, returning %s\n", context.c_str(), toString(c).c_str()); - return c; + if(iMutationBoundaryNext == iMutationBoundaryEnd) { + // Cleared means the entire range covering the subtree was cleared. It is assumed true + // if the range starting after the lower mutation boundary was cleared, and then proven false + // below if possible. + bool cleared = iMutationBoundary->second.rangeCleared(); + // Unchanged means the entire range covering the subtree was unchanged, it is assumed to be the + // opposite of cleared() and then proven false below if possible. + bool unchanged = !cleared; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + + // If the lower mutation boundary key is the same as the subtree lower bound then whether or not + // that key is being changed or cleared affects this subtree. + if(iMutationBoundary->first == lowerBound->key) { + // If subtree will be cleared (so far) but the lower boundary key is not cleared then the subtree is not cleared + if(cleared && !iMutationBoundary->second.keyCleared()) { + cleared = false; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + } + // If the subtree looked unchanged (so far) but the lower boundary is changed then the subtree is changed + if(unchanged && iMutationBoundary->second.keyChanged()) { + unchanged = false; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + } + } + + // If the higher mutation boundary key is the same as the subtree upper bound key then whether + // or not it is being changed or cleared affects this subtree. + if((cleared || unchanged) && iMutationBoundaryEnd->first == upperBound->key) { + // If the key is being changed then the records in this subtree with the same key must be removed + // so the subtree is definitely not unchanged, though it may be cleared to achieve the same effect.
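// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the single-mutation-range case above (and
// completed just below) starts from cleared = rangeCleared(), unchanged = !cleared, then
// disproves each flag using the two boundary keys. decide() is a hypothetical standalone
// restatement of that refinement; its bool inputs stand in for the iterator comparisons.
// ----------------------------------------------------------------------------------------
#include <cassert>
struct SubtreeDecision { bool cleared, unchanged; };
SubtreeDecision decide(bool rangeCleared,
                       bool lowerKeyMatches, bool lowerKeyCleared, bool lowerKeyChanged,
                       bool upperKeyMatches, bool upperKeyChanged) {
	bool cleared = rangeCleared;
	bool unchanged = !cleared;
	if(lowerKeyMatches) {
		if(cleared && !lowerKeyCleared) cleared = false;    // boundary key survives the clear
		if(unchanged && lowerKeyChanged) unchanged = false;
	}
	if((cleared || unchanged) && upperKeyMatches) {
		if(upperKeyChanged) unchanged = false; // records with the upper key must be removed
		else cleared = false;                  // records with the upper key must be kept
	}
	assert(!(cleared && unchanged)); // same invariant the patch asserts
	return {cleared, unchanged};
}
int main() {
	// Range cleared and the lower boundary key itself cleared: the whole subtree goes away.
	assert(decide(true, true, true, true, false, false).cleared);
	// No range clear and no boundary key changes: the subtree is untouched.
	assert(decide(false, true, false, false, false, false).unchanged);
	return 0;
}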
+ if(iMutationBoundaryEnd->second.keyChanged()) { + unchanged = false; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + } + else { + // If the key is not being changed then the records in this subtree can't be removed so the + // subtree is not being cleared. + cleared = false; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + } + } + + // The subtree cannot be both cleared and unchanged. + ASSERT(!(cleared && unchanged)); + + // If no changes in subtree + if(unchanged) { + results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s no changes on this subtree, returning %s\n", context.c_str(), toString(results).c_str()); + return results; + } + + // If subtree is cleared + if(cleared) { + debug_printf("%s %s cleared, deleting it, returning %s\n", context.c_str(), isLeaf ? "Page" : "Subtree", toString(results).c_str()); + Version clearVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : iMutationBoundary->second.rangeClearVersion.get(); + if(isLeaf) { + self->freeBtreePage(rootID, clearVersion); + } + else { + self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{clearVersion, rootID}); + } + return results; + } } self->counts.commitToPage++; - state Reference rawPage = wait(readPage(snapshot, root, self->m_usablePageSizeOverride, decodeLowerBound, decodeUpperBound)); + state Reference rawPage = wait(readPage(snapshot, rootID, decodeLowerBound, decodeUpperBound)); state BTreePage *page = (BTreePage *) rawPage->begin(); - debug_printf("%s commitSubtree(): %s\n", context.c_str(), page->toString(false, root, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str()); + ASSERT(isLeaf == page->isLeaf()); + debug_printf("%s commitSubtree(): %s\n", context.c_str(), page->toString(false, rootID, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str()); state BTreePage::BinaryTree::Cursor cursor = getReader(rawPage)->getCursor(); cursor.moveFirst(); + state Version writeVersion; + // Leaf Page - if(page->flags & BTreePage::IS_LEAF) { - VersionedChildrenT results; - std::vector merged; + if(isLeaf) { + state Standalone> merged; - debug_printf("%s id=%u MERGING EXISTING DATA WITH MUTATIONS:\n", context.c_str(), root); - if(REDWOOD_DEBUG) { - self->printMutationBuffer(iMutationBoundary, iMutationBoundaryEnd); - } - - // It's a given that the mutation map is not empty so it's safe to do this - Key mutationRangeStart = iMutationBoundary->first; + debug_printf("%s Leaf page, merging changes.\n", context.c_str()); // If replacement pages are written they will be at the minimum version seen in the mutations for this leaf Version minVersion = invalidVersion; - int changes = 0; // Now, process each mutation range and merge changes with existing data. + bool firstMutationBoundary = true; while(iMutationBoundary != iMutationBoundaryEnd) { debug_printf("%s New mutation boundary: '%s': %s\n", context.c_str(), printable(iMutationBoundary->first).c_str(), iMutationBoundary->second.toString().c_str()); SingleKeyMutationsByVersion::const_iterator iMutations; - // If the mutation boundary key is less than the lower bound key then skip startKeyMutations for - // this bounary, we're only processing this mutation range here to apply any clears to existing data. 
- if(iMutationBoundary->first < lowerBound->key) { + // For the first mutation boundary only, if the boundary key is less than the lower bound for the page + // then skip startKeyMutations for this boundary, we're only processing this mutation range here to apply + // a possible clear to existing data. + if(firstMutationBoundary && iMutationBoundary->first < lowerBound->key) { iMutations = iMutationBoundary->second.startKeyMutations.end(); } - // If the mutation boundary key is the same as the page lowerBound key then start reading single - // key mutations at the first version greater than the lowerBound key's version. - else if(!self->singleVersion && iMutationBoundary->first == lowerBound->key) { - iMutations = iMutationBoundary->second.startKeyMutations.upper_bound(lowerBound->version); - } else { iMutations = iMutationBoundary->second.startKeyMutations.begin(); } + firstMutationBoundary = false; SingleKeyMutationsByVersion::const_iterator iMutationsEnd = iMutationBoundary->second.startKeyMutations.end(); // Iterate over old versions of the mutation boundary key, outputting if necessary + bool boundaryKeyWritten = false; while(cursor.valid() && cursor.get().key == iMutationBoundary->first) { // If not in single version mode or there were no changes to the key if(!self->singleVersion || iMutationBoundary->second.noChanges()) { - merged.push_back(cursor.get()); + merged.push_back(merged.arena(), cursor.get()); debug_printf("%s Added %s [existing, boundary start]\n", context.c_str(), merged.back().toString().c_str()); + boundaryKeyWritten = true; } else { ASSERT(self->singleVersion); @@ -1846,29 +3866,39 @@ private: // Output mutations for the mutation boundary start key while(iMutations != iMutationsEnd) { const SingleKeyMutation &m = iMutations->second; - int maxPartSize = std::min(255, self->m_usablePageSizeOverride / 5); - if(m.isClear() || m.value.size() <= maxPartSize) { - if(iMutations->first < minVersion || minVersion == invalidVersion) - minVersion = iMutations->first; - ++changes; - merged.push_back(m.toRecord(iMutationBoundary->first, iMutations->first)); - debug_printf("%s Added non-split %s [mutation, boundary start]\n", context.c_str(), merged.back().toString().c_str()); + if(m.isClear() || m.value.size() <= self->m_maxPartSize) { + // If the boundary key was not yet written to the merged list then clears can be skipped. + // Note that in a more complex scenario where there are multiple sibling pages for the same key, with different + // versions and/or part numbers, this is still a valid thing to do. This is because a changing boundary + // key (set or clear) will result in any instances (different versions, split parts) of this key + // on sibling pages to the left of this page to be removed, so an explicit clear need only be stored + // if a record with the mutation boundary key was already written to this page. 
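// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: in the merge loop below, values larger than
// m_maxPartSize (min(255, usablePageSize / 5), set in init_impl) are emitted as multiple
// records via RedwoodRecordRef::split(start, partSize). splitValue() is a hypothetical
// standalone version of that chunking.
// ----------------------------------------------------------------------------------------
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>
std::vector<std::string> splitValue(const std::string &value, int maxPartSize) {
	std::vector<std::string> parts;
	int bytesLeft = (int)value.size();
	int start = 0;
	while(bytesLeft > 0) {
		int partSize = std::min(bytesLeft, maxPartSize); // same step as the merge loop
		parts.push_back(value.substr(start, partSize));
		bytesLeft -= partSize;
		start += partSize;
	}
	return parts;
}
int main() {
	std::vector<std::string> parts = splitValue(std::string(600, 'x'), 255);
	assert(parts.size() == 3 && parts[0].size() == 255 && parts[2].size() == 90);
	return 0;
}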
+ if(!boundaryKeyWritten && iMutations->second.isClear()) { + debug_printf("%s Skipped %s [mutation, unnecessary boundary key clear]\n", context.c_str(), m.toRecord(iMutationBoundary->first, iMutations->first).toString().c_str()); + } + else { + merged.push_back(merged.arena(), m.toRecord(iMutationBoundary->first, iMutations->first)); + debug_printf("%s Added non-split %s [mutation, boundary start]\n", context.c_str(), merged.back().toString().c_str()); + if(iMutations->first < minVersion || minVersion == invalidVersion) + minVersion = iMutations->first; + boundaryKeyWritten = true; + } } else { if(iMutations->first < minVersion || minVersion == invalidVersion) minVersion = iMutations->first; - ++changes; int bytesLeft = m.value.size(); int start = 0; RedwoodRecordRef whole(iMutationBoundary->first, iMutations->first, m.value); while(bytesLeft > 0) { - int partSize = std::min(bytesLeft, maxPartSize); + int partSize = std::min(bytesLeft, self->m_maxPartSize); // Don't copy the value chunk because this page will stay in memory until after we've built new version(s) of it - merged.push_back(whole.split(start, partSize)); + merged.push_back(merged.arena(), whole.split(start, partSize)); bytesLeft -= partSize; start += partSize; - debug_printf("%s Added split %s [mutation, boundary start]\n", context.c_str(), merged.back().toString().c_str()); + debug_printf("%s Added split %s [mutation, boundary start] bytesLeft %d\n", context.c_str(), merged.back().toString().c_str(), bytesLeft); } + boundaryKeyWritten = true; } ++iMutations; } @@ -1886,7 +3916,7 @@ private: bool remove = self->singleVersion && clearRangeVersion.present(); if(!remove) { - merged.push_back(cursor.get()); + merged.push_back(merged.arena(), cursor.get()); debug_printf("%s Added %s [existing, middle]\n", context.c_str(), merged.back().toString().c_str()); } else { @@ -1909,8 +3939,7 @@ private: Version clearVersion = clearRangeVersion.get(); if(clearVersion < minVersion || minVersion == invalidVersion) minVersion = clearVersion; - ++changes; - merged.push_back(RedwoodRecordRef(cursor.get().key, clearVersion)); + merged.push_back(merged.arena(), RedwoodRecordRef(cursor.get().key, clearVersion)); debug_printf("%s Added %s [existing, middle clear]\n", context.c_str(), merged.back().toString().c_str()); } cursor = nextCursor; @@ -1922,81 +3951,53 @@ private: } // Write any remaining existing keys, which are not subject to clears as they are beyond the cleared range. + bool upperMutationBoundaryKeyChanged = iMutationBoundaryEnd->second.keyChanged(); while(cursor.valid()) { - merged.push_back(cursor.get()); + // If the upper mutation boundary is being changed and the cursor's key matches it then stop because none of the earlier + // versions or fragments of that key should be written. + if(upperMutationBoundaryKeyChanged && cursor.get().key == iMutationBoundaryEnd->first) { + debug_printf("%s Skipped %s and beyond [existing, matches changed upper mutation boundary]\n", context.c_str(), cursor.get().toString().c_str()); + Version changedVersion = iMutationBoundaryEnd->second.startKeyMutations.begin()->first; + if(changedVersion < minVersion || minVersion == invalidVersion) + minVersion = changedVersion; + break; + } + merged.push_back(merged.arena(), cursor.get()); debug_printf("%s Added %s [existing, tail]\n", context.c_str(), merged.back().toString().c_str()); cursor.moveNext(); } - debug_printf("%s Done merging mutations into existing leaf contents, made %d changes\n", context.c_str(), changes); - // No changes were actually made. 
This could happen if the only mutations are clear ranges which do not match any records. - // But if a boundary was changed then we must rewrite the page anyway. - if(!boundaryChanged && minVersion == invalidVersion) { - VersionedChildrenT c({ {0, {*decodeLowerBound}, *decodeUpperBound} }); - debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(c).c_str()); - ASSERT(changes == 0); - return c; + if(minVersion == invalidVersion) { + results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(results).c_str()); + return results; + } + else { + debug_printf("%s Changes were made, writing.\n", context.c_str()); } // TODO: Make version and key splits based on contents of merged list, if keeping history + writeVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : minVersion; // If everything in the page was deleted then this page should be deleted as of the new version // Note that if a single range clear covered the entire page then we should not get this far - if(merged.empty() && root != 0) { - // TODO: For multi version mode only delete this page as of the new version - VersionedChildrenT c({}); - debug_printf("%s id=%u All leaf page contents were cleared, returning %s\n", context.c_str(), root, toString(c).c_str()); - return c; + if(merged.empty()) { + debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(), toString(results).c_str()); + self->freeBtreePage(rootID, writeVersion); + return results; } - IPager *pager = self->m_pager; - std::vector pages = buildPages(true, *lowerBound, *upperBound, merged, BTreePage::IS_LEAF, [pager](){ return pager->newPageBuffer(); }, self->m_usablePageSizeOverride); - - if(!self->singleVersion) { - ASSERT(false); -// // If there isn't still just a single page of data then this page became too large and was split. -// // The new split pages will be valid as of minVersion, but the old page remains valid at the old version -// if(pages.size() != 1) { -// results.push_back( {0, {*decodeLowerBound}, ??} ); -// debug_printf("%s Added versioned child set #1: %s\n", context.c_str(), toString(results.back()).c_str()); -// } -// else { -// // The page was updated but not size-split or version-split so the last page version's data -// // can be replaced with the new page contents -// if(pages.size() == 1) -// minVersion = 0; -// } - } - - // Write page(s), get new page IDs - Version writeVersion = self->singleVersion ? 
self->getLastCommittedVersion() + 1 : minVersion; - std::vector newPageIDs = self->writePages(pages, writeVersion, root, page, upperBound, THIS); - - // If this commitSubtree() is operating on the root, write new levels if needed until until we're returning a single page - if(root == self->m_root && pages.size() > 1) { - debug_printf("%s Building new root\n", context.c_str()); - self->buildNewRoot(writeVersion, pages, newPageIDs, page); - } - - results.push_back({writeVersion, {}, *upperBound}); - for(int i=0; i> entries = wait(writePages(self, true, lowerBound, upperBound, merged, page->height, writeVersion, rootID)); + results.arena().dependsOn(entries.arena()); + results.push_back(results.arena(), VersionAndChildrenRef(writeVersion, entries, *upperBound)); debug_printf("%s Merge complete, returning %s\n", context.c_str(), toString(results).c_str()); - - debug_printf("%s DONE.\n", context.c_str()); return results; } else { // Internal Page - - // TODO: Combine these into one vector and/or do something more elegant - state std::vector> futureChildren; + ASSERT(!isLeaf); + state std::vector>> futureChildren; bool first = true; while(cursor.valid()) { @@ -2018,8 +4019,8 @@ private: const RedwoodRecordRef &decodeChildLowerBound = cursor.get(); - LogicalPageID pageID = cursor.get().getPageID(); - ASSERT(pageID != 0); + BTreePageID pageID = cursor.get().getChildPage(); + ASSERT(!pageID.empty()); const RedwoodRecordRef &decodeChildUpperBound = cursor.moveNext() ? cursor.get() : *decodeUpperBound; @@ -2030,8 +4031,8 @@ private: const RedwoodRecordRef &childUpperBound = cursor.valid() ? cursor.get() : *upperBound; - debug_printf("%s recursing to PageID=%u lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", - context.c_str(), pageID, childLowerBound.toString().c_str(), childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(), decodeChildUpperBound.toString().c_str()); + debug_printf("%s recursing to %s lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", + context.c_str(), toString(pageID).c_str(), childLowerBound.toString().c_str(), childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(), decodeChildUpperBound.toString().c_str()); /* // TODO: If lower bound and upper bound have the same key, do something intelligent if possible @@ -2063,7 +4064,8 @@ private: futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, &childLowerBound, &childUpperBound)); } */ - futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, &childLowerBound, &childUpperBound, &decodeChildLowerBound, &decodeChildUpperBound)); + // If this page has height of 2 then its children are leaf nodes + futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, page->height == 2, &childLowerBound, &childUpperBound, &decodeChildLowerBound, &decodeChildUpperBound)); } // Waiting one at a time makes debugging easier @@ -2074,19 +4076,21 @@ private: } if(REDWOOD_DEBUG) { - debug_printf("%s Subtree update results for root PageID=%u\n", context.c_str(), root); + debug_printf("%s Subtree update results\n", context.c_str()); for(int i = 0; i < futureChildren.size(); ++i) { debug_printf("%s subtree result %s\n", context.c_str(), toString(futureChildren[i].get()).c_str()); } } - // TODO: Handle multi-versioned results + // TODO: Either handle multi-versioned results or change commitSubtree interface to return a single child set. 
ASSERT(self->singleVersion); + writeVersion = self->getLastCommittedVersion() + 1; cursor.moveFirst(); + // All of the things added to pageBuilder will exist in the arenas inside futureChildren or will be upperBound InternalPageBuilder pageBuilder(cursor); for(int i = 0; i < futureChildren.size(); ++i) { - const VersionedChildrenT &versionedChildren = futureChildren[i].get(); + VersionedChildrenT versionedChildren = futureChildren[i].get(); ASSERT(versionedChildren.size() <= 1); if(!versionedChildren.empty()) { @@ -2100,62 +4104,28 @@ private: if(pageBuilder.modified) { // If the page now has no children if(pageBuilder.childPageCount == 0) { - // If we are the root, write a new empty btree - if(root == 0) { - Reference page = self->m_pager->newPageBuffer(); - makeEmptyPage(page, BTreePage::IS_LEAF, self->m_usablePageSizeOverride); - RedwoodRecordRef rootEntry = dbBegin.withPageID(0); - self->writePage(0, page, self->getLastCommittedVersion() + 1, &dbBegin, &dbEnd); - VersionedChildrenT c({ {0, {dbBegin}, dbEnd } }); - debug_printf("%s id=%u All root page children were deleted, rewrote root as leaf, returning %s\n", context.c_str(), root, toString(c).c_str()); - return c; - } - else { - VersionedChildrenT c({}); - debug_printf("%s id=%u All internal page children were deleted #1 so deleting this page too, returning %s\n", context.c_str(), root, toString(c).c_str()); - return c; - } + debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", context.c_str(), toString(results).c_str()); + self->freeBtreePage(rootID, writeVersion); + return results; } else { - debug_printf("%s Internal PageID=%u modified, creating replacements.\n", context.c_str(), root); + debug_printf("%s Internal page modified, creating replacements.\n", context.c_str()); debug_printf("%s newChildren=%s lastUpperBound=%s upperBound=%s\n", context.c_str(), toString(pageBuilder.entries).c_str(), pageBuilder.lastUpperBound.toString().c_str(), upperBound->toString().c_str()); ASSERT(pageBuilder.lastUpperBound == *upperBound); - // TODO: Don't do this! - std::vector entries; - for(auto &o : pageBuilder.entries) { - entries.push_back(o); - } + Standalone> childEntries = wait(holdWhile(pageBuilder.entries, writePages(self, false, lowerBound, upperBound, pageBuilder.entries, page->height, writeVersion, rootID))); - std::vector pages = buildPages(false, *lowerBound, *upperBound, entries, 0, [=](){ return self->m_pager->newPageBuffer(); }, self->m_usablePageSizeOverride); - - Version writeVersion = self->getLastCommittedVersion() + 1; - std::vector newPageIDs = self->writePages(pages, writeVersion, root, page, upperBound, THIS); - - // If this commitSubtree() is operating on the root, write new levels if needed until until we're returning a single page - if(root == self->m_root) { - self->buildNewRoot(writeVersion, pages, newPageIDs, page); - } - - VersionedChildrenT vc(1); - vc.resize(1); - VersionedChildPageSet &c = vc.front(); - c.version = writeVersion; - c.upperBound = *upperBound; - - for(int i=0; i((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s Page has no changes, returning %s\n", context.c_str(), toString(results).c_str()); + return results; } } } @@ -2178,19 +4148,62 @@ private: // Wait for the latest commit that started to be finished. 
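// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: commit_impl() below picks the new root from
// commitSubtree()'s result: an empty result gets a fresh empty root, a single child becomes
// the root directly, and several children get new internal levels stacked on top until one
// page remains (buildNewRoot). levelsToSingleRoot() is a hypothetical model of that
// stacking, assuming a fixed fanout per internal page.
// ----------------------------------------------------------------------------------------
#include <cassert>
int levelsToSingleRoot(int childCount, int fanout) {
	int levels = 0;
	while(childCount > 1) {
		childCount = (childCount + fanout - 1) / fanout; // one new level, ceil division
		++levels;
	}
	return levels;
}
int main() {
	assert(levelsToSingleRoot(1, 100) == 0);   // already a single root page
	assert(levelsToSingleRoot(250, 100) == 2); // 250 children -> 3 pages -> 1 root
	return 0;
}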
wait(previousCommit); - debug_printf("%s: Beginning commit of version %" PRId64 "\n", self->m_name.c_str(), writeVersion); + + self->m_pager->setOldestVersion(self->m_newOldestVersion); + debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, self->m_newOldestVersion); + + state bool lazyDeleteStop = false; + state Future lazyDelete = incrementalSubtreeClear(self, &lazyDeleteStop); // Get the latest version from the pager, which is what we will read at - Version latestVersion = wait(self->m_pager->getLatestVersion()); + state Version latestVersion = self->m_pager->getLatestVersion(); debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); - if(REDWOOD_DEBUG) { - self->printMutationBuffer(mutations); + state Standalone rootPageID = self->m_header.root.get(); + state RedwoodRecordRef lowerBound = dbBegin.withPageID(rootPageID); + Standalone versionedRoots = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, self->m_header.height == 1, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); + debug_printf("CommitSubtree(root %s) returned %s\n", toString(rootPageID).c_str(), toString(versionedRoots).c_str()); + + // CommitSubtree on the root can only return 1 child at most because the pager interface only supports writing + // one meta record (which contains the root page) per commit. + ASSERT(versionedRoots.size() <= 1); + + // If the old root was deleted, write a new empty tree root node and free the old roots + if(versionedRoots.empty()) { + debug_printf("Writing new empty root.\n"); + LogicalPageID newRootID = wait(self->m_pager->newPageID()); + Reference page = self->m_pager->newPageBuffer(); + makeEmptyRoot(page); + self->m_header.height = 1; + self->m_pager->updatePage(newRootID, page); + rootPageID = BTreePageID((LogicalPageID *)&newRootID, 1); + } + else { + Standalone> newRootLevel(versionedRoots.front().children, versionedRoots.arena()); + if(newRootLevel.size() == 1) { + rootPageID = newRootLevel.front().getChildPage(); + } + else { + // If the new root level's size is not 1 then build new root level(s) + Standalone> newRootPage = wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height)); + rootPageID = newRootPage.front().getChildPage(); + } } - VersionedChildrenT newRoot = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), self->m_root, &dbBegin, &dbEnd, &dbBegin, &dbEnd)); + self->m_header.root.set(rootPageID, sizeof(headerSpace) - sizeof(m_header)); + + lazyDeleteStop = true; + wait(success(lazyDelete)); + debug_printf("Lazy delete freed %u pages\n", lazyDelete.get()); + + self->m_pager->setCommitVersion(writeVersion); + + wait(self->m_lazyDeleteQueue.flush()); + self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState(); + + debug_printf("Setting metakey\n"); + self->m_pager->setMetaKey(self->m_header.asKeyRef()); - self->m_pager->setLatestVersion(writeVersion); debug_printf("%s: Committing pager %" PRId64 "\n", self->m_name.c_str(), writeVersion); wait(self->m_pager->commit()); debug_printf("%s: Committed version %" PRId64 "\n", self->m_name.c_str(), writeVersion); @@ -2201,8 +4214,7 @@ private: self->m_mutationBuffers.erase(self->m_mutationBuffers.begin()); self->m_lastCommittedVersion = writeVersion; - ++self->counts.commits; -printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); + ++counts.commits; committed.send(Void()); return Void(); @@ -2217,11 
+4229,13 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); // PageCursors can be shared by many InternalCursors, making InternalCursor copying low overhead struct PageCursor : ReferenceCounted, FastAllocated { Reference parent; - LogicalPageID pageID; // Only needed for debugging purposes + BTreePageID pageID; // Only needed for debugging purposes Reference page; BTreePage::BinaryTree::Cursor cursor; - PageCursor(LogicalPageID id, Reference page, Reference parent = {}) + // id will normally reference memory owned by the parent, which is okay because a reference to the parent + // will be held in the cursor + PageCursor(BTreePageID id, Reference page, Reference parent = {}) : pageID(id), page(page), parent(parent), cursor(getReader().getCursor()) { } @@ -2234,35 +4248,50 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); return Reference(new PageCursor(*this)); } + const BTreePage * btPage() const { + return (const BTreePage *)page->begin(); + } + // Multiple InternalCursors can share a Page BTreePage::BinaryTree::Reader & getReader() const { return *(BTreePage::BinaryTree::Reader *)page->userData; } bool isLeaf() const { - const BTreePage *p = ((const BTreePage *)page->begin()); - return p->isLeaf(); + return btPage()->isLeaf(); } - Future> getChild(Reference pager, int usablePageSizeOverride) { + Future> getChild(Reference pager, int readAheadBytes = 0) { ASSERT(!isLeaf()); BTreePage::BinaryTree::Cursor next = cursor; next.moveNext(); const RedwoodRecordRef &rec = cursor.get(); - LogicalPageID id = rec.getPageID(); - Future> child = readPage(pager, id, usablePageSizeOverride, &rec, &next.getOrUpperBound()); + BTreePageID id = rec.getChildPage(); + Future> child = readPage(pager, id, &rec, &next.getOrUpperBound()); + + // Read ahead siblings at level 2 + if(readAheadBytes > 0 && btPage()->height == 2 && next.valid()) { + do { + debug_printf("preloading %s %d bytes left\n", ::toString(next.get().getChildPage()).c_str(), readAheadBytes); + // If any part of the page was already loaded then stop + if(next.get().value.present()) { + preLoadPage(pager.getPtr(), next.get().getChildPage()); + readAheadBytes -= page->size(); + } + } while(readAheadBytes > 0 && next.moveNext()); + } + return map(child, [=](Reference page) { return Reference(new PageCursor(id, page, Reference::addRef(this))); }); } std::string toString() const { - return format("PageID=%u, %s", pageID, cursor.valid() ? cursor.get().toString().c_str() : ""); + return format("%s, %s", ::toString(pageID).c_str(), cursor.valid() ? 
cursor.get().toString().c_str() : ""); } }; - LogicalPageID rootPageID; - int usablePageSizeOverride; + Standalone rootPageID; Reference pager; Reference pageCursor; @@ -2270,8 +4299,8 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); InternalCursor() { } - InternalCursor(Reference pager, LogicalPageID root, int usablePageSizeOverride) - : pager(pager), rootPageID(root), usablePageSizeOverride(usablePageSizeOverride) { + InternalCursor(Reference pager, BTreePageID root) + : pager(pager), rootPageID(root) { } std::string toString() const { @@ -2334,14 +4363,14 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); } // Otherwise read the root page - Future> root = readPage(pager, rootPageID, usablePageSizeOverride, &dbBegin, &dbEnd); + Future> root = readPage(pager, rootPageID, &dbBegin, &dbEnd); return map(root, [=](Reference p) { pageCursor = Reference(new PageCursor(rootPageID, p)); return Void(); }); } - ACTOR Future seekLessThanOrEqual_impl(InternalCursor *self, RedwoodRecordRef query) { + ACTOR Future seekLessThanOrEqual_impl(InternalCursor *self, RedwoodRecordRef query, int prefetchBytes) { Future f = self->moveToRoot(); // f will almost always be ready @@ -2368,7 +4397,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); return true; } - Reference child = wait(self->pageCursor->getChild(self->pager, self->usablePageSizeOverride)); + Reference child = wait(self->pageCursor->getChild(self->pager, prefetchBytes)); self->pageCursor = child; } else { @@ -2379,8 +4408,8 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); } } - Future seekLTE(RedwoodRecordRef query) { - return seekLessThanOrEqual_impl(this, query); + Future seekLTE(RedwoodRecordRef query, int prefetchBytes) { + return seekLessThanOrEqual_impl(this, query, prefetchBytes); } ACTOR Future move_impl(InternalCursor *self, bool forward) { @@ -2421,7 +4450,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); } } - Reference child = wait(self->pageCursor->getChild(self->pager, self->usablePageSizeOverride)); + Reference child = wait(self->pageCursor->getChild(self->pager)); forward ? child->cursor.moveFirst() : child->cursor.moveLast(); self->pageCursor = child; } @@ -2433,13 +4462,6 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); return move_impl(this, forward); } - Future moveNext() { - return move_impl(this, true); - } - Future movePrev() { - return move_impl(this, false); - } - // Move to the first or last record of the database. 
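// ----------------------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: getChild() above spends a byte budget
// (prefetchBytes) preloading height-2 sibling pages until the budget runs out.
// pagesPreloaded() is a hypothetical standalone model of that accounting.
// ----------------------------------------------------------------------------------------
#include <cassert>
#include <vector>
int pagesPreloaded(const std::vector<int> &siblingPageSizes, int readAheadBytes) {
	int preloaded = 0;
	for(int size : siblingPageSizes) {
		if(readAheadBytes <= 0) {
			break;
		}
		++preloaded;            // issue the read for this sibling
		readAheadBytes -= size; // charge its size against the budget
	}
	return preloaded;
}
int main() {
	assert(pagesPreloaded({4096, 4096, 4096}, 8192) == 2);
	assert(pagesPreloaded({4096}, 0) == 0); // zero budget disables read-ahead
	return 0;
}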
ACTOR Future move_end(InternalCursor *self, bool begin) { Future f = self->moveToRoot(); @@ -2469,7 +4491,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); return true; } - Reference child = wait(self->pageCursor->getChild(self->pager, self->usablePageSizeOverride)); + Reference child = wait(self->pageCursor->getChild(self->pager)); self->pageCursor = child; } else { @@ -2491,9 +4513,9 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); // KeyValueRefs returned become invalid once the cursor is moved class Cursor : public IStoreCursor, public ReferenceCounted, public FastAllocated, NonCopyable { public: - Cursor(Reference pageSource, LogicalPageID root, Version recordVersion, int usablePageSizeOverride) + Cursor(Reference pageSource, BTreePageID root, Version recordVersion) : m_version(recordVersion), - m_cur1(pageSource, root, usablePageSizeOverride), + m_cur1(pageSource, root), m_cur2(m_cur1) { } @@ -2517,42 +4539,56 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); Optional m_kv; public: - virtual Future findEqual(KeyRef key) { return find_impl(this, key, true, 0); } - virtual Future findFirstEqualOrGreater(KeyRef key, bool needValue, int prefetchNextBytes) { return find_impl(this, key, needValue, 1); } - virtual Future findLastLessOrEqual(KeyRef key, bool needValue, int prefetchPriorBytes) { return find_impl(this, key, needValue, -1); } + Future findEqual(KeyRef key) override { + return find_impl(this, key, 0); + } + Future findFirstEqualOrGreater(KeyRef key, int prefetchBytes) override { + return find_impl(this, key, 1, prefetchBytes); + } + Future findLastLessOrEqual(KeyRef key, int prefetchBytes) override { + return find_impl(this, key, -1, prefetchBytes); + } - virtual Future next(bool needValue) { return move(this, true, needValue); } - virtual Future prev(bool needValue) { return move(this, false, needValue); } + Future next() override { + return move(this, true); + } + Future prev() override { + return move(this, false); + } - virtual bool isValid() { + bool isValid() override { return m_kv.present(); } - virtual KeyRef getKey() { + KeyRef getKey() override { return m_kv.get().key; } - //virtual StringRef getCompressedKey() = 0; - virtual ValueRef getValue() { + ValueRef getValue() override { return m_kv.get().value; } - // TODO: Either remove this method or change the contract so that key and value strings returned are still valid after the cursor is - // moved and allocate them in some arena that this method resets. 
- virtual void invalidateReturnedStrings() { - } - - std::string toString() const { + std::string toString(bool includePaths = false) const { std::string r; r += format("Cursor(%p) ver: %" PRId64 " ", this, m_version); if(m_kv.present()) { - r += format(" KV: '%s' -> '%s'\n", m_kv.get().key.printable().c_str(), m_kv.get().value.printable().c_str()); + r += format(" KV: '%s' -> '%s'", m_kv.get().key.printable().c_str(), m_kv.get().value.printable().c_str()); } else { - r += " KV: \n"; + r += " KV: "; + } + if(includePaths) { + r += format("\n Cur1: %s", m_cur1.toString().c_str()); + r += format("\n Cur2: %s", m_cur2.toString().c_str()); + } + else { + if(m_cur1.valid()) { + r += format("\n Cur1: %s", m_cur1.get().toString().c_str()); + } + if(m_cur2.valid()) { + r += format("\n Cur2: %s", m_cur2.get().toString().c_str()); + } } - r += format(" Cur1: %s\n", m_cur1.toString().c_str()); - r += format(" Cur2: %s\n", m_cur2.toString().c_str()); return r; } @@ -2562,12 +4598,12 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); // for less than or equal use cmp < 0 // for greater than or equal use cmp > 0 // for equal use cmp == 0 - ACTOR static Future find_impl(Cursor *self, KeyRef key, bool needValue, int cmp) { + ACTOR static Future find_impl(Cursor *self, KeyRef key, int cmp, int prefetchBytes = 0) { // Search for the last key at or before (key, version, \xff) state RedwoodRecordRef query(key, self->m_version, {}, 0, std::numeric_limits::max()); self->m_kv.reset(); - wait(success(self->m_cur1.seekLTE(query))); + wait(success(self->m_cur1.seekLTE(query, prefetchBytes))); debug_printf("find%sE(%s): %s\n", cmp > 0 ? "GT" : (cmp == 0 ? "" : "LT"), query.toString().c_str(), self->toString().c_str()); // If we found the target key with a present value then return it as it is valid for any cmp type @@ -2610,7 +4646,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); } // Get the next present key at the target version. Handles invalid cursor too. 
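// ---------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] find_impl above implements
// all three public find operations with a single seekLTE() followed by a
// fix-up step selected by `cmp` (cmp == 0 findEqual, cmp > 0
// findFirstEqualOrGreater, cmp < 0 findLastLessOrEqual, per the comments in
// the diff). A minimal, flow-free sketch of those semantics over a std::map,
// for orientation only; the real cursor also filters by version, skips absent
// values, and may prefetch sibling pages.
#include <iterator>
#include <map>
#include <optional>
#include <string>
#include <utility>

using KV = std::pair<std::string, std::string>;

std::optional<KV> find(const std::map<std::string, std::string>& kv,
                       const std::string& key, int cmp) {
	auto it = kv.upper_bound(key);            // first entry > key
	bool haveLTE = (it != kv.begin());
	if (haveLTE) --it;                        // last entry <= key, like seekLTE()

	if (haveLTE && it->first == key) return KV(*it); // exact hit satisfies all modes
	if (cmp == 0) return std::nullopt;        // findEqual: miss
	if (cmp > 0) {                            // findFirstEqualOrGreater: step forward
		auto next = haveLTE ? std::next(it) : kv.begin();
		if (next == kv.end()) return std::nullopt;
		return KV(*next);
	}
	if (!haveLTE) return std::nullopt;        // findLastLessOrEqual: miss
	return KV(*it);
}
// ---------------------------------------------------------------------------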
- wait(self->next(needValue)); + wait(self->next()); } else if(cmp < 0) { // Mode is <=, which is the same as the seekLTE(query) @@ -2620,15 +4656,14 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); } // Move to previous present kv pair at the target version - wait(self->prev(needValue)); + wait(self->prev()); } return Void(); } - // TODO: use needValue - ACTOR static Future move(Cursor *self, bool fwd, bool needValue) { - debug_printf("Cursor::move(%d): Cursor = %s\n", fwd, self->toString().c_str()); + ACTOR static Future move(Cursor *self, bool fwd) { + debug_printf("Cursor::move(%d): Start %s\n", fwd, self->toString().c_str()); ASSERT(self->m_cur1.valid()); // If kv is present then the key/version at cur1 was already returned so move to a new key @@ -2637,6 +4672,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); ASSERT(self->m_cur1.valid()); loop { self->m_cur2 = self->m_cur1; + debug_printf("Cursor::move(%d): Advancing cur1 %s\n", fwd, self->toString().c_str()); bool valid = wait(self->m_cur1.move(fwd)); if(!valid || self->m_cur1.get().key != self->m_cur2.get().key) { break; @@ -2655,9 +4691,11 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); // TODO: This may already be the case, store state to track this condition and avoid the reset here if(self->m_cur1.valid()) { self->m_cur2 = self->m_cur1; + debug_printf("Cursor::move(%d): Advancing cur2 %s\n", fwd, self->toString().c_str()); wait(success(self->m_cur2.move(true))); } + self->m_kv.reset(); while(self->m_cur1.valid()) { if(self->m_cur1.presentAtVersion(self->m_version) && @@ -2670,20 +4708,19 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); if(fwd) { // Moving forward, move cur2 forward and keep cur1 pointing to the prior (predecessor) record - debug_printf("Cursor::move(%d): Moving forward, Cursor = %s\n", fwd, self->toString().c_str()); + debug_printf("Cursor::move(%d): Moving forward %s\n", fwd, self->toString().c_str()); self->m_cur1 = self->m_cur2; wait(success(self->m_cur2.move(true))); } else { // Moving backward, move cur1 backward and keep cur2 pointing to the prior (successor) record - debug_printf("Cursor::move(%d): Moving backward, Cursor = %s\n", fwd, self->toString().c_str()); + debug_printf("Cursor::move(%d): Moving backward %s\n", fwd, self->toString().c_str()); self->m_cur2 = self->m_cur1; wait(success(self->m_cur1.move(false))); } } - self->m_kv.reset(); debug_printf("Cursor::move(%d): Exit, end of db reached. 
Cursor = %s\n", fwd, self->toString().c_str()); return Void(); } @@ -2693,6 +4730,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); self->m_arena = Arena(); const RedwoodRecordRef &rec = self->m_cur1.get(); + self->m_kv.reset(); debug_printf("readFullKVPair: Starting at %s\n", self->toString().c_str()); // Unsplit value, cur1 will hold the key and value memory @@ -2703,12 +4741,12 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); return Void(); } + debug_printf("readFullKVPair: Split, first record %s\n", rec.toString().c_str()); + // Split value, need to coalesce split value parts into a buffer in arena, // after which cur1 will point to the first part and kv.key will reference its key ASSERT(rec.chunk.start + rec.value.get().size() == rec.chunk.total); - debug_printf("readFullKVPair: Split, totalsize %d %s\n", rec.chunk.total, self->toString().c_str()); - // Allocate space for the entire value in the same arena as the key state int bytesLeft = rec.chunk.total; state StringRef dst = makeString(bytesLeft, self->m_arena); @@ -2739,35 +4777,23 @@ RedwoodRecordRef VersionedBTree::dbBegin(StringRef(), 0); RedwoodRecordRef VersionedBTree::dbEnd(LiteralStringRef("\xff\xff\xff\xff\xff")); VersionedBTree::Counts VersionedBTree::counts; -ACTOR template -Future catchError(Promise error, Future f) { - try { - T result = wait(f); - return result; - } catch(Error &e) { - if(e.code() != error_code_actor_cancelled && error.canBeSet()) - error.sendError(e); - throw; - } -} - class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, UID logID) : m_filePrefix(filePrefix) { // TODO: This constructor should really just take an IVersionedStore - IPager *pager = new IndirectShadowPager(filePrefix); - m_tree = new VersionedBTree(pager, filePrefix, true, pager->getUsablePageSize()); + IPager2 *pager = new DWALPager(4096, filePrefix, 0); + m_tree = new VersionedBTree(pager, filePrefix, true); m_init = catchError(init_impl(this)); } - virtual Future init() { + Future init() { return m_init; } ACTOR Future init_impl(KeyValueStoreRedwoodUnversioned *self) { TraceEvent(SevInfo, "RedwoodInit").detail("FilePrefix", self->m_filePrefix); wait(self->m_tree->init()); - Version v = wait(self->m_tree->getLatestVersion()); + Version v = self->m_tree->getLatestVersion(); self->m_tree->setWriteVersion(v + 1); TraceEvent(SevInfo, "RedwoodInitComplete").detail("FilePrefix", self->m_filePrefix); return Void(); @@ -2790,33 +4816,34 @@ public: delete self; } - virtual void close() { + void close() { shutdown(this, false); } - virtual void dispose() { + void dispose() { shutdown(this, true); } - virtual Future< Void > onClosed() { + Future< Void > onClosed() { return m_closed.getFuture(); } Future commit(bool sequential = false) { Future c = m_tree->commit(); + m_tree->setOldestVersion(m_tree->getLatestVersion()); m_tree->setWriteVersion(m_tree->getWriteVersion() + 1); return catchError(c); } - virtual KeyValueStoreType getType() { + KeyValueStoreType getType() { return KeyValueStoreType::SSD_REDWOOD_V1; } - virtual StorageBytes getStorageBytes() { + StorageBytes getStorageBytes() { return m_tree->getStorageBytes(); } - virtual Future< Void > getError() { + Future< Void > getError() { return delayed(m_error.getFuture()); }; @@ -2825,12 +4852,12 @@ public: m_tree->clear(range); } - virtual void set( KeyValueRef keyValue, const Arena* arena = NULL ) { + void set( KeyValueRef keyValue, const Arena* arena = NULL ) { 
debug_printf("SET %s\n", keyValue.key.printable().c_str()); m_tree->set(keyValue); } - virtual Future< Standalone< VectorRef< KeyValueRef > > > readRange(KeyRangeRef keys, int rowLimit = 1<<30, int byteLimit = 1<<30) { + Future< Standalone< VectorRef< KeyValueRef > > > readRange(KeyRangeRef keys, int rowLimit = 1<<30, int byteLimit = 1<<30) { debug_printf("READRANGE %s\n", printable(keys).c_str()); return catchError(readRange_impl(this, keys, rowLimit, byteLimit)); } @@ -2842,9 +4869,11 @@ public: ASSERT( byteLimit > 0 ); state Reference cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion()); + // Prefetch is currently only done in the forward direction + state int prefetchBytes = rowLimit > 1 ? byteLimit : 0; if(rowLimit >= 0) { - wait(cur->findFirstEqualOrGreater(keys.begin, true, 0)); + wait(cur->findFirstEqualOrGreater(keys.begin, prefetchBytes)); while(cur->isValid() && cur->getKey() < keys.end) { KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue())); accumulatedBytes += kv.expectedSize(); @@ -2852,21 +4881,21 @@ public: if(--rowLimit == 0 || accumulatedBytes >= byteLimit) { break; } - wait(cur->next(true)); + wait(cur->next()); } } else { - wait(cur->findLastLessOrEqual(keys.end, true, 0)); + wait(cur->findLastLessOrEqual(keys.end)); if(cur->isValid() && cur->getKey() == keys.end) - wait(cur->prev(true)); + wait(cur->prev()); while(cur->isValid() && cur->getKey() >= keys.begin) { KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue())); accumulatedBytes += kv.expectedSize(); result.push_back(result.arena(), kv); - if(--rowLimit == 0 || accumulatedBytes >= byteLimit) { + if(++rowLimit == 0 || accumulatedBytes >= byteLimit) { break; } - wait(cur->prev(true)); + wait(cur->prev()); } } return result; @@ -2883,7 +4912,7 @@ public: return Optional(); } - virtual Future< Optional< Value > > readValue(KeyRef key, Optional< UID > debugID = Optional()) { + Future< Optional< Value > > readValue(KeyRef key, Optional< UID > debugID = Optional()) { return catchError(readValue_impl(this, key, debugID)); } @@ -2900,7 +4929,7 @@ public: return Optional(); } - virtual Future< Optional< Value > > readValuePrefix(KeyRef key, int maxLength, Optional< UID > debugID = Optional()) { + Future< Optional< Value > > readValuePrefix(KeyRef key, int maxLength, Optional< UID > debugID = Optional()) { return catchError(readValuePrefix_impl(this, key, maxLength, debugID)); } @@ -2915,7 +4944,7 @@ private: Promise m_error; template inline Future catchError(Future f) { - return ::catchError(m_error, f); + return forwardError(f, m_error); } }; @@ -2978,11 +5007,11 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version if(deterministicRandom()->coinflip()) { state Key randomKey = randomKV().key; debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", v, start.toString().c_str(), end.toString().c_str(), randomKey.toString().c_str()); - wait(deterministicRandom()->coinflip() ? cur->findFirstEqualOrGreater(randomKey, true, 0) : cur->findLastLessOrEqual(randomKey, true, 0)); + wait(deterministicRandom()->coinflip() ? 
cur->findFirstEqualOrGreater(randomKey) : cur->findLastLessOrEqual(randomKey)); } debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.toString().c_str(), end.toString().c_str()); - wait(cur->findFirstEqualOrGreater(start, true, 0)); + wait(cur->findFirstEqualOrGreater(start)); state std::vector results; @@ -3030,7 +5059,7 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version ASSERT(errors == 0); results.push_back(KeyValue(KeyValueRef(cur->getKey(), cur->getValue()))); - wait(cur->next(true)); + wait(cur->next()); } // Make sure there are no further written kv pairs that would be present at this version. @@ -3064,9 +5093,9 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version } // Now read the range from the tree in reverse order and compare to the saved results - wait(cur->findLastLessOrEqual(end, true, 0)); + wait(cur->findLastLessOrEqual(end)); if(cur->isValid() && cur->getKey() == end) - wait(cur->prev(true)); + wait(cur->prev()); state std::vector::const_reverse_iterator r = results.rbegin(); @@ -3092,7 +5121,7 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version } ++r; - wait(cur->prev(true)); + wait(cur->prev()); } if(r != results.rend()) { @@ -3185,7 +5214,7 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, break; } } catch(Error &e) { - if(e.code() != error_code_end_of_stream) { + if(e.code() != error_code_end_of_stream && e.code() != error_code_transaction_too_old) { throw; } } @@ -3194,25 +5223,34 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, // Does a random range read, doesn't trap/report errors ACTOR Future randomReader(VersionedBTree *btree) { - state Reference cur; - loop { - wait(yield()); - if(!cur || deterministicRandom()->random01() > .1) { - Version v = btree->getLastCommittedVersion(); - if(!btree->isSingleVersion()) { - v = deterministicRandom()->randomInt(1, v + 1); - } - cur = btree->readAtVersion(v); - } - - state KeyValue kv = randomKV(10, 0); - wait(cur->findFirstEqualOrGreater(kv.key, true, 0)); - state int c = deterministicRandom()->randomInt(0, 100); - while(cur->isValid() && c-- > 0) { - wait(success(cur->next(true))); + try { + state Reference cur; + loop { wait(yield()); + if(!cur || deterministicRandom()->random01() > .1) { + Version v = btree->getLastCommittedVersion(); + if(!btree->isSingleVersion()) { + v = deterministicRandom()->randomInt(1, v + 1); + } + cur = btree->readAtVersion(v); + } + + state KeyValue kv = randomKV(10, 0); + wait(cur->findFirstEqualOrGreater(kv.key)); + state int c = deterministicRandom()->randomInt(0, 100); + while(cur->isValid() && c-- > 0) { + wait(success(cur->next())); + wait(yield()); + } } } + catch(Error &e) { + if(e.code() != error_code_transaction_too_old) { + throw e; + } + } + + return Void(); } struct IntIntPair { @@ -3344,18 +5382,16 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { // Test pageID stuff. 
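// ---------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] randomReader and verify above
// now tolerate transaction_too_old: once setOldestVersion() advances past a
// background reader's read version, reads at that version legitimately fail,
// and only that error should be swallowed while everything else still
// propagates. A flow-free sketch of the same filtering, using a C++ exception
// as a stand-in for the flow Error:
#include <stdexcept>

struct transaction_too_old : std::runtime_error {
	transaction_too_old() : std::runtime_error("transaction_too_old") {}
};

template <class ReadLoop>
void runBackgroundReader(ReadLoop&& body) {
	try {
		body(); // e.g. seek/scan at a fixed read version until cancelled
	} catch (const transaction_too_old&) {
		// Expected once the oldest retained version passes our read version.
	} // any other exception escapes to the caller, mirroring `throw e;`
}

int main() {
	runBackgroundReader([] { throw transaction_too_old(); }); // swallowed
	return 0;
}
// ---------------------------------------------------------------------------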
{ - LogicalPageID id = 1; + LogicalPageID ids[] = {1, 5}; + BTreePageID id(ids, 2); RedwoodRecordRef r; - r.setPageID(id); - ASSERT(r.getPageID() == id); - RedwoodRecordRef s; - s = r; - ASSERT(s.getPageID() == id); - RedwoodRecordRef t(r); - ASSERT(t.getPageID() == id); - r.setPageID(id + 1); - ASSERT(s.getPageID() == id); - ASSERT(t.getPageID() == id); + r.setChildPage(id); + ASSERT(r.getChildPage() == id); + ASSERT(r.getChildPage().begin() == id.begin()); + + Standalone r2 = r; + ASSERT(r2.getChildPage() == id); + ASSERT(r2.getChildPage().begin() != id.begin()); } // Testing common prefix calculation for integer fields using the member function that calculates this directly @@ -3569,7 +5605,11 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { DeltaTree::Cursor fwd = r.getCursor(); DeltaTree::Cursor rev = r.getCursor(); + DeltaTree::Reader rValuesOnly(tree, &prev, &next); + DeltaTree::Cursor fwdValueOnly = rValuesOnly.getCursor(); + ASSERT(fwd.moveFirst()); + ASSERT(fwdValueOnly.moveFirst()); ASSERT(rev.moveLast()); int i = 0; while(1) { @@ -3583,9 +5623,21 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { printf("Delta: %s\n", rev.node->raw->delta().toString().c_str()); ASSERT(false); } + if(fwdValueOnly.get().value != items[i].value) { + printf("forward values-only iterator i=%d\n %s found\n %s expected\n", i, fwdValueOnly.get().toString().c_str(), items[i].toString().c_str()); + printf("Delta: %s\n", fwdValueOnly.node->raw->delta().toString().c_str()); + ASSERT(false); + } ++i; - ASSERT(fwd.moveNext() == rev.movePrev()); - ASSERT(fwd.valid() == rev.valid()); + + bool more = fwd.moveNext(); + ASSERT(fwdValueOnly.moveNext() == more); + ASSERT(rev.movePrev() == more); + + ASSERT(fwd.valid() == more); + ASSERT(fwdValueOnly.valid() == more); + ASSERT(rev.valid() == more); + if(!fwd.valid()) { break; } @@ -3696,48 +5748,55 @@ struct SimpleCounter { }; TEST_CASE("!/redwood/correctness/btree") { - state bool useDisk = true; // MemoryPager is not being maintained currently. - - state std::string pagerFile = "unittest_pageFile"; - IPager *pager; + state std::string pagerFile = "unittest_pageFile.redwood"; + IPager2 *pager; state bool serialTest = deterministicRandom()->coinflip(); state bool shortTest = deterministicRandom()->coinflip(); state bool singleVersion = true; // Multi-version mode is broken / not finished - state double startTime = now(); - printf("serialTest: %d shortTest: %d singleVersion: %d\n", serialTest, shortTest, singleVersion); - - if(useDisk) { - printf("Deleting existing test data...\n"); - deleteFile(pagerFile); - deleteFile(pagerFile + "0.pagerlog"); - deleteFile(pagerFile + "1.pagerlog"); - pager = new IndirectShadowPager(pagerFile); - } - else - pager = createMemoryPager(); - - printf("Initializing...\n"); - state int pageSize = shortTest ? 200 : (deterministicRandom()->coinflip() ? pager->getUsablePageSize() : deterministicRandom()->randomInt(200, 400)); - state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion, pageSize); - wait(btree->init()); + state int pageSize = shortTest ? 200 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(200, 400)); // We must be able to fit at least two any two keys plus overhead in a page to prevent // a situation where the tree cannot be grown upward with decreasing level size. 
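// ---------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] the pageID unit test above
// checks two properties of the new BTreePageID: it is a *list* of
// LogicalPageIDs (a node can span several pages), and it is a non-owning view
// until copied into a Standalone, which deep-copies into arena memory, hence
// `r2.getChildPage().begin() != id.begin()`. A flow-free model with
// illustrative stand-in types:
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

using LogicalPageID = uint32_t;

// Non-owning view over a run of page IDs, standing in for BTreePageID.
struct PageIDRef {
	const LogicalPageID* data = nullptr;
	std::size_t count = 0;
	const LogicalPageID* begin() const { return data; }
	const LogicalPageID* end() const { return data + count; }
};

// Owning copy, standing in for Standalone<RedwoodRecordRef>: the child-page
// list is copied into storage the record owns.
struct OwnedRecord {
	std::vector<LogicalPageID> storage;
	PageIDRef childPage() const { return { storage.data(), storage.size() }; }
};

int main() {
	LogicalPageID ids[] = { 1, 5 };           // a node spanning two pages
	PageIDRef view{ ids, 2 };                 // shares the caller's memory
	OwnedRecord copy{ { view.begin(), view.end() } };
	assert(std::vector<LogicalPageID>(copy.childPage().begin(),
	                                  copy.childPage().end()) ==
	       std::vector<LogicalPageID>({ 1, 5 })); // same contents...
	assert(copy.childPage().begin() != view.begin()); // ...but new memory
	return 0;
}
// ---------------------------------------------------------------------------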
- // TODO: Handle arbitrarily large keys state int maxKeySize = deterministicRandom()->randomInt(4, pageSize * 2); state int maxValueSize = deterministicRandom()->randomInt(0, pageSize * 4); - state int maxCommitSize = shortTest ? 1000 : randomSize(10e6); - state int mutationBytesTarget = shortTest ? 5000 : randomSize(50e6); - state double clearChance = deterministicRandom()->random01() * .1; + state int maxCommitSize = shortTest ? 1000 : randomSize(std::min((maxKeySize + maxValueSize) * 20000, 10e6)); + state int mutationBytesTarget = shortTest ? 5000 : randomSize(std::min(maxCommitSize * 100, 100e6)); + state double clearProbability = deterministicRandom()->random01() * .1; + state double clearPostSetProbability = deterministicRandom()->random01() * .1; + state double coldStartProbability = deterministicRandom()->random01(); + state double advanceOldVersionProbability = deterministicRandom()->random01(); + state double maxDuration = 60; - printf("Using page size %d, max key size %d, max value size %d, clearchance %f, total mutation byte target %d\n", pageSize, maxKeySize, maxValueSize, clearChance, mutationBytesTarget); + printf("\n"); + printf("serialTest: %d\n", serialTest); + printf("shortTest: %d\n", shortTest); + printf("singleVersion: %d\n", singleVersion); + printf("pageSize: %d\n", pageSize); + printf("maxKeySize: %d\n", maxKeySize); + printf("maxValueSize: %d\n", maxValueSize); + printf("maxCommitSize: %d\n", maxCommitSize); + printf("mutationBytesTarget: %d\n", mutationBytesTarget); + printf("clearProbability: %f\n", clearProbability); + printf("clearPostSetProbability: %f\n", clearPostSetProbability); + printf("coldStartProbability: %f\n", coldStartProbability); + printf("advanceOldVersionProbability: %f\n", advanceOldVersionProbability); + printf("\n"); + + printf("Deleting existing test data...\n"); + deleteFile(pagerFile); + + printf("Initializing...\n"); + state double startTime = now(); + pager = new DWALPager(pageSize, pagerFile, 0); + state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); + wait(btree->init()); state std::map, Optional> written; state std::set keys; - state Version lastVer = wait(btree->getLatestVersion()); + state Version lastVer = btree->getLatestVersion(); printf("Starting from version: %" PRId64 "\n", lastVer); state Version version = lastVer + 1; @@ -3759,7 +5818,7 @@ TEST_CASE("!/redwood/correctness/btree") { state Future commit = Void(); - while(mutationBytes.get() < mutationBytesTarget) { + while(mutationBytes.get() < mutationBytesTarget && (now() - startTime) < maxDuration) { if(now() - startTime > 600) { mutationBytesTarget = mutationBytes.get(); } @@ -3771,7 +5830,7 @@ TEST_CASE("!/redwood/correctness/btree") { } // Sometimes do a clear range - if(deterministicRandom()->random01() < clearChance) { + if(deterministicRandom()->random01() < clearProbability) { Key start = randomKV(maxKeySize, 1).key; Key end = (deterministicRandom()->random01() < .01) ?
keyAfter(start) : randomKV(maxKeySize, 1).key; @@ -3824,6 +5883,22 @@ TEST_CASE("!/redwood/correctness/btree") { } btree->clear(range); + + // Sometimes set the range start after the clear + if(deterministicRandom()->random01() < clearPostSetProbability) { + KeyValue kv = randomKV(0, maxValueSize); + kv.key = range.begin; + btree->set(kv); + written[std::make_pair(kv.key.toString(), version)] = kv.value.toString(); + } + + // Sometimes set the range end after the clear + if(deterministicRandom()->random01() < clearPostSetProbability) { + KeyValue kv = randomKV(0, maxValueSize); + kv.key = range.end; + btree->set(kv); + written[std::make_pair(kv.key.toString(), version)] = kv.value.toString(); + } } else { // Set a key @@ -3864,7 +5939,13 @@ TEST_CASE("!/redwood/correctness/btree") { Version v = version; // Avoid capture of version as a member of *this + // Sometimes advance the oldest version to close the gap between the oldest and latest versions by a random amount. + if(deterministicRandom()->random01() < advanceOldVersionProbability) { + btree->setOldestVersion(btree->getLastCommittedVersion() - deterministicRandom()->randomInt(0, btree->getLastCommittedVersion() - btree->getOldestVersion() + 1)); + } + commit = map(btree->commit(), [=](Void) { + printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); // Notify the background verifier that version is committed and therefore readable committedVersions.send(v); return Void(); @@ -3884,8 +5965,8 @@ TEST_CASE("!/redwood/correctness/btree") { mutationBytesTargetThisCommit = randomSize(maxCommitSize); // Recover from disk at random - if(!serialTest && useDisk && deterministicRandom()->random01() < .02) { - printf("Recovering from disk.\n"); + if(!serialTest && deterministicRandom()->random01() < coldStartProbability) { + printf("Recovering from disk after next commit.\n"); // Wait for outstanding commit debug_printf("Waiting for outstanding commit\n"); @@ -3896,16 +5977,17 @@ TEST_CASE("!/redwood/correctness/btree") { debug_printf("Waiting for verification to complete.\n"); wait(verifyTask); + debug_printf("Closing btree\n"); Future closedFuture = btree->onClosed(); btree->close(); wait(closedFuture); - debug_printf("Reopening btree\n"); - IPager *pager = new IndirectShadowPager(pagerFile); - btree = new VersionedBTree(pager, pagerFile, singleVersion, pageSize); + printf("Reopening btree from disk.\n"); + IPager2 *pager = new DWALPager(pageSize, pagerFile, 0); + btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); - Version v = wait(btree->getLatestVersion()); + Version v = btree->getLatestVersion(); ASSERT(v == version); printf("Recovered from disk. 
Latest version %" PRId64 "\n", v); @@ -3927,6 +6009,7 @@ TEST_CASE("!/redwood/correctness/btree") { debug_printf("Waiting for outstanding commit\n"); wait(commit); committedVersions.sendError(end_of_stream()); + randomTask.cancel(); debug_printf("Waiting for verification to complete.\n"); wait(verifyTask); @@ -3934,48 +6017,131 @@ TEST_CASE("!/redwood/correctness/btree") { if(errorCount != 0) throw internal_error(); + wait(btree->destroyAndCheckSanity()); + Future closedFuture = btree->onClosed(); btree->close(); + debug_printf("Closing.\n"); wait(closedFuture); return Void(); } -ACTOR Future randomSeeks(VersionedBTree *btree, int count) { - state Version readVer = wait(btree->getLatestVersion()); +ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, char lastChar) { + state Version readVer = btree->getLatestVersion(); state int c = 0; state double readStart = timer(); printf("Executing %d random seeks\n", count); state Reference cur = btree->readAtVersion(readVer); while(c < count) { - state Key k = randomString(20, 'a', 'b'); - wait(success(cur->findFirstEqualOrGreater(k, false, 0))); + state Key k = randomString(20, firstChar, lastChar); + wait(success(cur->findFirstEqualOrGreater(k))); ++c; } double elapsed = timer() - readStart; - printf("Point read speed %d/s\n", int(count / elapsed)); + printf("Random seek speed %d/s\n", int(count / elapsed)); return Void(); } - -TEST_CASE("!/redwood/performance/set") { - state std::string pagerFile = "unittest_pageFile"; +ACTOR Future randomScans(VersionedBTree *btree, int count, int width, int readAhead, char firstChar, char lastChar) { + state Version readVer = btree->getLatestVersion(); + state int c = 0; + state double readStart = timer(); + printf("Executing %d random scans\n", count); + state Reference cur = btree->readAtVersion(readVer); + state bool adaptive = readAhead < 0; + state int totalScanBytes = 0; + while(c++ < count) { + state Key k = randomString(20, firstChar, lastChar); + wait(success(cur->findFirstEqualOrGreater(k, readAhead))); + if(adaptive) { + readAhead = totalScanBytes / c; + } + state int w = width; + while(w > 0 && cur->isValid()) { + totalScanBytes += cur->getKey().size(); + totalScanBytes += cur->getValue().size(); + wait(cur->next()); + --w; + } + } + double elapsed = timer() - readStart; + printf("Completed %d scans: readAhead=%d width=%d bytesRead=%d scansRate=%d/s\n", count, readAhead, width, totalScanBytes, int(count / elapsed)); + return Void(); +} + +TEST_CASE("!/redwood/correctness/pager/cow") { + state std::string pagerFile = "unittest_pageFile.redwood"; printf("Deleting old test data\n"); deleteFile(pagerFile); - deleteFile(pagerFile + "0.pagerlog"); - deleteFile(pagerFile + "1.pagerlog"); - IPager *pager = new IndirectShadowPager(pagerFile); + int pageSize = 4096; + state IPager2 *pager = new DWALPager(pageSize, pagerFile, 0); + + wait(success(pager->init())); + state LogicalPageID id = wait(pager->newPageID()); + Reference p = pager->newPageBuffer(); + memset(p->mutate(), (char)id, p->size()); + pager->updatePage(id, p); + pager->setMetaKey(LiteralStringRef("asdfasdf")); + wait(pager->commit()); + Reference p2 = wait(pager->readPage(id, true)); + printf("%s\n", StringRef(p2->begin(), p2->size()).toHexString().c_str()); + + // TODO: Verify reads, do more writes and reads to make this a real pager validator + + Future onClosed = pager->onClosed(); + pager->close(); + wait(onClosed); + + return Void(); +} + +TEST_CASE("!/redwood/performance/set") { + state SignalableActorCollection 
actors; + VersionedBTree::counts.clear(); + + // If a test file is passed in by environment then don't write new data to it. + state bool reload = getenv("TESTFILE") == nullptr; + state std::string pagerFile = reload ? "unittest.redwood" : getenv("TESTFILE"); + + if(reload) { + printf("Deleting old test data\n"); + deleteFile(pagerFile); + } + + state int pageSize = 4096; + state int64_t pageCacheBytes = FLOW_KNOBS->PAGE_CACHE_4K; + DWALPager *pager = new DWALPager(pageSize, pagerFile, pageCacheBytes); state bool singleVersion = true; - state VersionedBTree *btree = new VersionedBTree(pager, "unittest_pageFile", singleVersion); + state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); state int nodeCount = 1e9; - state int maxChangesPerVersion = 500000; - state int64_t kvBytesTarget = 200e6; - state int maxKeyPrefixSize = 50; - state int maxValueSize = 100; - state int maxConsecutiveRun = 1; + state int maxChangesPerVersion = 5000; + state int64_t kvBytesTarget = 4e9; + state int commitTarget = 20e6; + state int minKeyPrefixBytes = 0; + state int maxKeyPrefixBytes = 25; + state int minValueSize = 0; + state int maxValueSize = 500; + state int maxConsecutiveRun = 10; + state char firstKeyChar = 'a'; + state char lastKeyChar = 'b'; + + printf("pageSize: %d\n", pageSize); + printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes); + printf("trailingIntegerIndexRange: %d\n", nodeCount); + printf("maxChangesPerVersion: %d\n", maxChangesPerVersion); + printf("minKeyPrefixBytes: %d\n", minKeyPrefixBytes); + printf("maxKeyPrefixBytes: %d\n", maxKeyPrefixBytes); + printf("maxConsecutiveRun: %d\n", maxConsecutiveRun); + printf("minValueSize: %d\n", minValueSize); + printf("maxValueSize: %d\n", maxValueSize); + printf("commitTarget: %d\n", commitTarget); + printf("kvBytesTarget: %" PRId64 "\n", kvBytesTarget); + printf("KeyLexicon '%c' to '%c'\n", firstKeyChar, lastKeyChar); + state int64_t kvBytes = 0; state int64_t kvBytesTotal = 0; state int records = 0; @@ -3986,60 +6152,110 @@ TEST_CASE("!/redwood/performance/set") { state double intervalStart = timer(); state double start = intervalStart; - while(kvBytesTotal < kvBytesTarget) { - Version lastVer = wait(btree->getLatestVersion()); - state Version version = lastVer + 1; - btree->setWriteVersion(version); - int changes = deterministicRandom()->randomInt(0, maxChangesPerVersion); + if(reload) { + while(kvBytesTotal < kvBytesTarget) { + wait(yield()); - while(changes > 0) { - KeyValue kv; - kv.key = randomString(kv.arena(), deterministicRandom()->randomInt(sizeof(uint32_t), maxKeyPrefixSize + sizeof(uint32_t) + 1), 'a', 'b'); - int32_t index = deterministicRandom()->randomInt(0, nodeCount); - int runLength = deterministicRandom()->randomInt(1, maxConsecutiveRun + 1); + Version lastVer = btree->getLatestVersion(); + state Version version = lastVer + 1; + btree->setWriteVersion(version); + int changes = deterministicRandom()->randomInt(0, maxChangesPerVersion); - while(runLength > 0 && changes > 0) { - *(uint32_t *)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); - kv.value = StringRef((uint8_t *)value.data(), deterministicRandom()->randomInt(0, value.size())); + while(changes > 0 && kvBytes < commitTarget) { + KeyValue kv; + kv.key = randomString(kv.arena(), deterministicRandom()->randomInt(minKeyPrefixBytes + sizeof(uint32_t), maxKeyPrefixBytes + sizeof(uint32_t) + 1), firstKeyChar, lastKeyChar); + int32_t index = deterministicRandom()->randomInt(0, nodeCount); + int runLength = 
deterministicRandom()->randomInt(1, maxConsecutiveRun + 1); - btree->set(kv); + while(runLength > 0 && changes > 0) { + *(uint32_t *)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); + kv.value = StringRef((uint8_t *)value.data(), deterministicRandom()->randomInt(minValueSize, maxValueSize + 1)); - --runLength; - --changes; - kvBytes += kv.key.size() + kv.value.size(); - ++records; + btree->set(kv); + + --runLength; + --changes; + kvBytes += kv.key.size() + kv.value.size(); + ++records; + } + } + + if(kvBytes >= commitTarget) { + btree->setOldestVersion(btree->getLastCommittedVersion()); + wait(commit); + printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + + // Avoid capturing via this to freeze counter values + int recs = records; + int kvb = kvBytes; + + // Capturing invervalStart via this->intervalStart makes IDE's unhappy as they do not know about the actor state object + double *pIntervalStart = &intervalStart; + + commit = map(btree->commit(), [=](Void result) { + printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); + double elapsed = timer() - *pIntervalStart; + printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, kvb / elapsed / 1e6); + *pIntervalStart = timer(); + return Void(); + }); + + kvBytesTotal += kvBytes; + kvBytes = 0; + records = 0; } } - if(kvBytes > 2e6) { - wait(commit); - printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); - - // Avoid capturing via this to freeze counter values - int recs = records; - int kvb = kvBytes; - - // Capturing invervalStart via this->intervalStart makes IDE's unhappy as they do not know about the actor state object - double *pIntervalStart = &intervalStart; - - commit = map(btree->commit(), [=](Void result) { - double elapsed = timer() - *pIntervalStart; - printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, kvb / elapsed / 1e6); - *pIntervalStart = timer(); - return Void(); - }); - - kvBytesTotal += kvBytes; - kvBytes = 0; - records = 0; - } + wait(commit); + printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); } - wait(commit); - printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + int seeks = 1e6; + printf("Warming cache with seeks\n"); + actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); - state int reads = 30000; - wait(randomSeeks(btree, reads) && randomSeeks(btree, reads) && randomSeeks(btree, reads)); + state int ops = 10000; + + printf("Serial scans with adaptive readAhead...\n"); + actors.add(randomScans(btree, ops, 50, -1, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial scans with readAhead 3 pages...\n"); + actors.add(randomScans(btree, ops, 50, 12000, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial scans with readAhead 2 pages...\n"); + 
actors.add(randomScans(btree, ops, 50, 8000, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial scans with readAhead 1 page...\n"); + actors.add(randomScans(btree, ops, 50, 4000, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial scans...\n"); + actors.add(randomScans(btree, ops, 50, 0, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial seeks...\n"); + actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Parallel seeks...\n"); + actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); Future closedFuture = btree->onClosed(); btree->close(); diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index c50ffde07f..b7b5a07fca 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -386,6 +386,7 @@ struct Role { static const Role LOG_ROUTER; static const Role DATA_DISTRIBUTOR; static const Role RATEKEEPER; + static const Role STORAGE_CACHE; static const Role COORDINATOR; std::string roleName; @@ -455,6 +456,7 @@ ACTOR Future logRouter(TLogInterface interf, InitializeLogRouterRequest re Reference> db); ACTOR Future dataDistributor(DataDistributorInterface ddi, Reference> db); ACTOR Future ratekeeper(RatekeeperInterface rki, Reference> db); +ACTOR Future storageCache(StorageServerInterface interf, uint16_t id, Reference> db); void registerThreadForProfiling(); void updateCpuProfiler(ProfilerRequest req); diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 0f1b533bc0..d3f7377046 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -34,7 +34,7 @@ #include "fdbclient/FailureMonitorClient.h" #include "fdbserver/CoordinationInterface.h" #include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/MoveKeys.actor.h" @@ -183,7 +183,7 @@ extern void createTemplateDatabase(); // FIXME: this really belongs in a header somewhere since it is actually used. 
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs); -extern const char* getHGVersion(); +extern const char* getSourceVersion(); extern void flushTraceFileVoid(); @@ -518,7 +518,7 @@ void* parentWatcher(void *arg) { static void printVersion() { printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); - printf("source version %s\n", getHGVersion()); + printf("source version %s\n", getSourceVersion()); printf("protocol %" PRIx64 "\n", currentProtocolVersion.version()); } @@ -1672,7 +1672,7 @@ int main(int argc, char* argv[]) { TraceEvent("ProgramStart") .setMaxEventLength(12000) .detail("RandomSeed", opts.randomSeed) - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detail("FileSystem", opts.fileSystemPath) diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 783bcb160c..d2be7b965d 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -46,7 +46,6 @@ - @@ -54,6 +53,7 @@ + @@ -63,7 +63,6 @@ - @@ -188,7 +187,6 @@ - @@ -198,7 +196,6 @@ - false false diff --git a/fdbserver/fdbserver.vcxproj.filters b/fdbserver/fdbserver.vcxproj.filters index 653b3324ff..9a1cabdec9 100644 --- a/fdbserver/fdbserver.vcxproj.filters +++ b/fdbserver/fdbserver.vcxproj.filters @@ -197,6 +197,7 @@ workloads + @@ -274,8 +275,6 @@ workloads - - @@ -330,7 +329,6 @@ - @@ -387,8 +385,6 @@ - - diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 56e6bf1cbc..bfc8eb6a15 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -684,6 +684,9 @@ ACTOR Future readTransactionSystemState( Reference self, Refer Standalone> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) ); self->allTags.clear(); + if(self->lastEpochEnd > 0) { + self->allTags.push_back(cacheTag); + } if(self->forceRecovery) { self->safeLocality = oldLogSystem->getLogSystemConfig().tLogs[0].locality; @@ -1345,6 +1348,15 @@ ACTOR Future masterCore( Reference self ) { tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccf->getConnectionString().toString()); tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue()); tr.set(recoveryCommitRequest.arena, primaryDatacenterKey, self->myInterface.locality.dcId().present() ? 
self->myInterface.locality.dcId().get() : StringRef()); + + //FIXME: remove this code, caching the entire normal keyspace as a test of functionality + //TODO: caching disabled for this merge + //tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.begin), storageCacheValue({0})); + //tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.end), storageCacheValue({})); + //tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.begin), serverKeysTrue); + //tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.end), serverKeysFalse); + //tr.set(recoveryCommitRequest.arena, cacheChangeKeyFor(0), BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned())); + //tr.set(recoveryCommitRequest.arena, cacheChangeKey, BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned())); tr.clear(recoveryCommitRequest.arena, tLogDatacentersKeys); for(auto& dc : self->primaryDcId) { @@ -1356,7 +1368,7 @@ ACTOR Future masterCore( Reference self ) { } } - applyMetadataMutations(self->dbgid, recoveryCommitRequest.arena, tr.mutations.slice(mmApplied, tr.mutations.size()), self->txnStateStore, NULL, NULL); + applyMetadataMutations(self->dbgid, recoveryCommitRequest.arena, tr.mutations.slice(mmApplied, tr.mutations.size()), self->txnStateStore, nullptr, nullptr); mmApplied = tr.mutations.size(); tr.read_snapshot = self->recoveryTransactionVersion; // lastEpochEnd would make more sense, but isn't in the initial window of the resolver(s) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 12d5c9e7a9..c72b3829fc 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -75,25 +75,6 @@ inline bool canReplyWith(Error e) { }; } -struct StorageServer; -class ValueOrClearToRef { -public: - static ValueOrClearToRef value(ValueRef const& v) { return ValueOrClearToRef(v, false); } - static ValueOrClearToRef clearTo(KeyRef const& k) { return ValueOrClearToRef(k, true); } - - bool isValue() const { return !isClear; }; - bool isClearTo() const { return isClear; } - - ValueRef const& getValue() const { ASSERT( isValue() ); return item; }; - KeyRef const& getEndKey() const { ASSERT(isClearTo()); return item; }; - -private: - ValueOrClearToRef( StringRef item, bool isClear ) : item(item), isClear(isClear) {} - - StringRef item; - bool isClear; -}; - struct AddingShard : NonCopyable { KeyRange keys; Future fetchClient; // holds FetchKeys() actor @@ -390,6 +371,8 @@ public: KeyRangeMap< Reference > shards; uint64_t shardChangeCounter; // max( shards->changecounter ) + KeyRangeMap cachedRangeMap; // indicates if a key-range is being cached + // newestAvailableVersion[k] // == invalidVersion -> k is unavailable at all versions // <= storageVersion -> k is unavailable at all versions (but might be read anyway from storage if we are in the process of committing makeShardDurable) @@ -516,6 +499,8 @@ public: specialCounter(cc, "VersionLag", [self](){ return self->versionLag; }); specialCounter(cc, "LocalRate", [self]{ return self->currentRate() * 100; }); + specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); }); + specialCounter(cc, "FetchKeysFetchActive", [self](){ return self->fetchKeysParallelismLock.activePermits(); }); specialCounter(cc, "FetchKeysWaiting", [self](){ return self->fetchKeysParallelismLock.waiters(); }); @@ -890,9 +875,10 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { } StorageMetrics metrics; - 
metrics.bytesReadPerKSecond = v.present() ? std::max((int64_t)(req.key.size() + v.get().size()), - SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE) - : SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + // If the read yields no value, randomly sample the empty read. + metrics.bytesReadPerKSecond = + v.present() ? std::max((int64_t)(req.key.size() + v.get().size()), SERVER_KNOBS->EMPTY_READ_PENALTY) + : SERVER_KNOBS->EMPTY_READ_PENALTY; data->metrics.notify(req.key, metrics); if( req.debugID.present() ) @@ -1082,7 +1068,6 @@ void merge( Arena& arena, VectorRef& output ASSERT( output.size() <= originalLimit ); } -// readRange reads up to |limit| rows from the given range and version, combining data->storage and data->versionedData. // If limit>=0, it returns the first rows in the range (sorted ascending), otherwise the last rows (sorted descending). // readRange has O(|result|) + O(log |data|) cost ACTOR Future readRange( StorageServer* data, Version version, KeyRange range, int limit, int* pLimitBytes ) { @@ -1100,6 +1085,12 @@ ACTOR Future readRange( StorageServer* data, Version version, //state int originalLimitBytes = *pLimitBytes; //state bool track = rrid.first() == 0x1bc134c2f752187cLL; + // Check if the desired key-range intersects the cached key-ranges + // TODO Find a more efficient way to do it + // TODO Also need this check in single key/value lookup + auto cached = data->cachedRangeMap.intersectingRanges(range); + result.cached = (cached.begin() != cached.end()); + // FIXME: Review pLimitBytes behavior // if (limit >= 0) we are reading forward, else backward @@ -1271,15 +1262,15 @@ ACTOR Future readRange( StorageServer* data, Version version, result.more = limit == 0 || *pLimitBytes<=0; // FIXME: Does this have to be exact? result.version = version; StorageMetrics metrics; - metrics.bytesReadPerKSecond = std::max(readSize, SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE); + metrics.bytesReadPerKSecond = std::max(readSize, SERVER_KNOBS->EMPTY_READ_PENALTY); data->metrics.notify(limit >= 0 ? range.begin : range.end, metrics); return result; } -bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) { +//bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) { // Returns true if the given range suffices to at least begin to resolve the given KeySelectorRef - return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end); -} +// return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end); +//} ACTOR Future findKey( StorageServer* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset) // Attempts to find the key indicated by sel in the data at version, within range. @@ -1327,14 +1318,13 @@ ACTOR Future findKey( StorageServer* data, KeySelectorRef sel, Version vers *pOffset = 0; StorageMetrics metrics; - metrics.bytesReadPerKSecond = - std::max((int64_t)rep.data[index].key.size(), SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE); + metrics.bytesReadPerKSecond = std::max((int64_t)rep.data[index].key.size(), SERVER_KNOBS->EMPTY_READ_PENALTY); data->metrics.notify(sel.getKey(), metrics); return rep.data[ index ].key; } else { StorageMetrics metrics; - metrics.bytesReadPerKSecond = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + metrics.bytesReadPerKSecond = SERVER_KNOBS->EMPTY_READ_PENALTY; data->metrics.notify(sel.getKey(), metrics); // FIXME: If range.begin=="" && !forward, return success? 
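// ---------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] the hunks above replace
// BYTES_READ_UNITS_PER_SAMPLE with EMPTY_READ_PENALTY so that reads returning
// little or nothing still register a minimum cost in the read-bandwidth
// sampler, keeping hot-but-empty ranges visible. A minimal sketch of the
// clamping; the constant's value here is an assumption for illustration, the
// real value comes from SERVER_KNOBS:
#include <algorithm>
#include <cstdint>

constexpr int64_t EMPTY_READ_PENALTY = 20; // assumed knob value

// Cost charged against a key for a read that returned `bytesRead` bytes.
int64_t readSampleCost(int64_t bytesRead) {
	return std::max(bytesRead, EMPTY_READ_PENALTY);
}
// e.g. readSampleCost(0) == 20 for a miss, readSampleCost(5000) == 5000.
// ---------------------------------------------------------------------------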
@@ -1466,7 +1456,7 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) for (int i = 0; i < r.data.size(); i++) { StorageMetrics m; - m.bytesReadPerKSecond = r.data[i].expectedSize(); + m.bytesReadPerKSecond = std::max((int64_t)r.data[i].expectedSize(), SERVER_KNOBS->EMPTY_READ_PENALTY); data->metrics.notify(r.data[i].key, m); } @@ -1772,11 +1762,6 @@ bool expandMutation( MutationRef& m, StorageServer::VersionedData const& data, U return true; } -bool isClearContaining( StorageServer::VersionedData::ViewAtVersion const& view, KeyRef key ) { - auto i = view.lastLessOrEqual(key); - return i && i->isClearTo() && i->getEndKey() > key; -} - void applyMutation( StorageServer *self, MutationRef const& m, Arena& arena, StorageServer::VersionedData &data ) { // m is expected to be in arena already // Clear split keys are added to arena @@ -1806,7 +1791,7 @@ void applyMutation( StorageServer *self, MutationRef const& m, Arena& arena, Sto } else if (m.type == MutationRef::ClearRange) { data.erase( m.param1, m.param2 ); ASSERT( m.param2 > m.param1 ); - ASSERT( !isClearContaining( data.atLatest(), m.param1 ) ); + ASSERT( !data.isClearContaining( data.atLatest(), m.param1 ) ); data.insert( m.param1, ValueOrClearToRef::clearTo(m.param2) ); self->watches.triggerRange( m.param1, m.param2 ); } @@ -2461,6 +2446,8 @@ void StorageServer::addMutation(Version version, MutationRef const& mutation, Ke printf(" eager: %s\n", printable( eagerReads->getKeyEnd( mutation.param2 ) ).c_str() ); } applyMutation( this, expanded, mLog.arena(), mutableData() ); + //printf("\nSSUpdate: Printing versioned tree after applying mutation\n"); + //mutableData().printTree(version); } struct OrderByVersion { @@ -2490,8 +2477,8 @@ static const KeyRef persistPrimaryLocality = LiteralStringRef( PERSIST_PREFIX "P class StorageUpdater { public: - StorageUpdater() : fromVersion(invalidVersion), currentVersion(invalidVersion), restoredVersion(invalidVersion), processedStartKey(false) {} - StorageUpdater(Version fromVersion, Version restoredVersion) : fromVersion(fromVersion), currentVersion(fromVersion), restoredVersion(restoredVersion), processedStartKey(false) {} + StorageUpdater() : fromVersion(invalidVersion), currentVersion(invalidVersion), restoredVersion(invalidVersion), processedStartKey(false), processedCacheStartKey(false) {} + StorageUpdater(Version fromVersion, Version restoredVersion) : fromVersion(fromVersion), currentVersion(fromVersion), restoredVersion(restoredVersion), processedStartKey(false), processedCacheStartKey(false) {} void applyMutation(StorageServer* data, MutationRef const& m, Version ver) { //TraceEvent("SSNewVersion", data->thisServerID).detail("VerWas", data->mutableData().latestVersion).detail("ChVer", ver); @@ -2503,8 +2490,12 @@ public: } if (m.param1.startsWith( systemKeys.end )) { - //TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver); - applyPrivateData( data, m ); + if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) + applyPrivateCacheData( data, m); + else { + //TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver); + applyPrivateData( data, m ); + } } else { // FIXME: enable when debugMutation is active //for(auto m = changes[c].mutations.begin(); m; ++m) { @@ -2526,6 +2517,9 @@ private: bool nowAssigned; bool processedStartKey; + KeyRef cacheStartKey; + bool processedCacheStartKey; + void applyPrivateData( StorageServer* data, MutationRef 
const& m ) { TraceEvent(SevDebug, "SSPrivateMutation", data->thisServerID).detail("Mutation", m.toString()); @@ -2586,6 +2580,37 @@ private: ASSERT(false); // Unknown private mutation } } + + void applyPrivateCacheData( StorageServer* data, MutationRef const& m ) { + TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString()); + + if (processedCacheStartKey) { + // Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end) + ASSERT((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)); + KeyRangeRef keys( cacheStartKey.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ), + m.param1.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix )); + data->cachedRangeMap.insert(keys, true); + //TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Begin", keys.begin).detail("End", keys.end); + //fprintf(stderr, "applyPrivateCacheData : begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str()); + + //Figure out the affected shard ranges and maintain the cached key-range information in the in-memory map + // TODO revisit- we are not splitting the cached ranges based on shards as of now. + if (0) { + auto cachedRanges = data->shards.intersectingRanges(keys); + for(auto shard = cachedRanges.begin(); shard != cachedRanges.end(); ++shard) { + KeyRangeRef intersectingRange = shard.range() & keys; + data->cachedRangeMap.insert(KeyRangeRef(intersectingRange.begin, intersectingRange.end), true); + } + } + processedCacheStartKey = false; + } else if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) { + // Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end) + cacheStartKey = m.param1; + processedCacheStartKey = true; + } else { + ASSERT(false); // Unknown private mutation + } + } }; ACTOR Future update( StorageServer* data, bool* pReceivedUpdate ) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 688f34cfca..b88bcfc475 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -360,6 +360,7 @@ ACTOR Future pingDatabase( Database cx ) { loop { try { tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE ); + tr.setOption( FDBTransactionOptions::LOCK_AWARE ); Optional v = wait( tr.get( StringRef("/Liveness/" + deterministicRandom()->randomUniqueID().toString() ) ) ); tr.makeSelfConflicting(); wait( tr.commit() ); @@ -1092,11 +1093,12 @@ ACTOR Future runTests( Reference runTests( Reference registrationClient( ProcessClass initialClass, Reference>> ddInterf, Reference>> rkInterf, - Reference> degraded) { + Reference> degraded, + PromiseStream< ErrorInfo > errors, + LocalityData locality, + Reference> dbInfo) { // Keeps the cluster controller (as it may be re-elected) informed that this worker exists // The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply (requiring us to re-register) // The registration request piggybacks optional distributor interface if it exists.
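// ---------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] applyPrivateCacheData above
// relies on the krm* convention that a range change arrives as a [begin, end)
// *pair* of SetValue mutations, so it buffers the begin key and only inserts
// the cached range when the matching end key arrives. A flow-free sketch of
// that two-phase state machine (prefix stripping and shard splitting omitted):
#include <cassert>
#include <map>
#include <optional>
#include <string>

class CachedRangeTracker {
	std::optional<std::string> pendingBegin; // like processedCacheStartKey + cacheStartKey
	std::map<std::string, std::string> cachedRanges; // begin -> end, non-overlapping

public:
	// Called once per private cache mutation, in commit order.
	void apply(const std::string& key) {
		if (!pendingBegin) {
			pendingBegin = key;           // first of the pair: remember begin
		} else {
			assert(*pendingBegin <= key); // pairs arrive as [begin, end)
			cachedRanges[*pendingBegin] = key;
			pendingBegin.reset();         // ready for the next pair
		}
	}
	bool isCached(const std::string& k) const {
		auto it = cachedRanges.upper_bound(k);
		if (it == cachedRanges.begin()) return false;
		--it;                             // last range with begin <= k
		return k < it->second;            // inside [begin, end)?
	}
};

int main() {
	CachedRangeTracker t;
	t.apply("apple");  // begin of the pair
	t.apply("banana"); // end of the pair: ["apple","banana") is now cached
	assert(t.isCached("avocado"));
	assert(!t.isCached("cherry"));
	return 0;
}
// ---------------------------------------------------------------------------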
state Generation requestGeneration = 0; state ProcessClass processClass = initialClass; + state Reference>>> scInterf( new AsyncVar>>() ); + state Future cacheProcessFuture; + state Future cacheErrorsFuture; loop { - RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), degraded->get()); + RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), scInterf->get(), degraded->get()); Future registrationReply = ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().registerWorker.getReply(request) ) : Never(); choose { when ( RegisterWorkerReply reply = wait( registrationReply )) { processClass = reply.processClass; asyncPriorityInfo->set( reply.priorityInfo ); + + if(!reply.storageCache.present()) { + cacheProcessFuture.cancel(); + scInterf->set(Optional>()); + } else if (!scInterf->get().present() || scInterf->get().get().first != reply.storageCache.get()) { + StorageServerInterface recruited; + recruited.locality = locality; + recruited.initEndpoints(); + + std::map details; + startRole( Role::STORAGE_CACHE, recruited.id(), interf.id(), details ); + + //DUMPTOKEN(recruited.getVersion); + DUMPTOKEN(recruited.getValue); + DUMPTOKEN(recruited.getKey); + DUMPTOKEN(recruited.getKeyValues); + DUMPTOKEN(recruited.getShardState); + DUMPTOKEN(recruited.waitMetrics); + DUMPTOKEN(recruited.splitMetrics); + DUMPTOKEN(recruited.getStorageMetrics); + DUMPTOKEN(recruited.waitFailure); + DUMPTOKEN(recruited.getQueuingMetrics); + DUMPTOKEN(recruited.getKeyValueStoreType); + DUMPTOKEN(recruited.watchValue); + + cacheProcessFuture = storageCache( recruited, reply.storageCache.get(), dbInfo ); + cacheErrorsFuture = forwardError(errors, Role::STORAGE_CACHE, recruited.id(), setWhenDoneOrError(cacheProcessFuture, scInterf, Optional>())); + scInterf->set(std::make_pair(reply.storageCache.get(), recruited)); + } } when ( wait( ccInterface->onChange() )) {} when ( wait( ddInterf->onChange() ) ) {} when ( wait( rkInterf->onChange() ) ) {} + when ( wait( scInterf->onChange() ) ) {} when ( wait( degraded->onChange() ) ) {} } } @@ -956,7 +992,7 @@ ACTOR Future workerServer( wait(waitForAll(recoveries)); recoveredDiskFiles.send(Void()); - errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf, degraded ) ); + errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf, degraded, errors, locality, dbInfo ) ); TraceEvent("RecoveriesComplete", interf.id()); @@ -964,8 +1000,10 @@ ACTOR Future workerServer( when( RebootRequest req = waitNext( interf.clientInterface.reboot.getFuture() ) ) { state RebootRequest rebootReq = req; + // If suspendDuration is INT_MAX, the trace will not be logged if it was inside the next block + // Also a useful trace to have even if suspendDuration is 0 + TraceEvent("RebootRequestSuspendingProcess").detail("Duration", req.waitForDuration); if(req.waitForDuration) { - TraceEvent("RebootRequestSuspendingProcess").detail("Duration", req.waitForDuration); flushTraceFileVoid(); setProfilingEnabled(0); g_network->stop(); @@ -1496,4 +1534,5 @@ const Role Role::TESTER("Tester", "TS"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); const Role Role::RATEKEEPER("Ratekeeper", "RK"); +const Role Role::STORAGE_CACHE("StorageCache", "SC"); 
const Role Role::COORDINATOR("Coordinator", "CD"); diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index 9188f6d094..33519ee333 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -33,6 +33,7 @@ struct AtomicOpsWorkload : TestWorkload { double testDuration, transactionsPerSecond; vector<Future<Void>> clients; + uint64_t lbsum, ubsum; // Lower and upper bounds on the expected sum of operations when opType = AddValue AtomicOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), opNum(0) @@ -47,6 +48,9 @@ struct AtomicOpsWorkload : TestWorkload { apiVersion500 = ((sharedRandomNumber % 10) == 0); TraceEvent("AtomicOpsApiVersion500").detail("ApiVersion500", apiVersion500); + lbsum = 0; + ubsum = 0; + int64_t randNum = sharedRandomNumber / 10; if(opType == -1) opType = randNum % 8; @@ -102,10 +106,11 @@ struct AtomicOpsWorkload : TestWorkload { } virtual Future<Void> start( Database const& cx ) { - for(int c=0; c<actorCount; c++) clients.push_back( timeout( atomicOpWorker( cx->clone(), this, actorCount / transactionsPerSecond ), testDuration, Void()) ); + for (int c = 0; c < actorCount; c++) { + clients.push_back( + timeout(atomicOpWorker(cx->clone(), this, actorCount / transactionsPerSecond), testDuration, Void())); + } + return delay(testDuration); } @@ -118,9 +123,37 @@ struct AtomicOpsWorkload : TestWorkload { virtual void getMetrics( vector<PerfMetric>& m ) { } - Key logKey( int group ) { return StringRef(format("log%08x%08x%08x",group,clientId,opNum++));} + std::pair<Key, Key> logDebugKey(int group) { + Key logKey(format("log%08x%08x%08x", group, clientId, opNum)); + Key debugKey(format("debug%08x%08x%08x", group, clientId, opNum)); + opNum++; + return std::make_pair(logKey, debugKey); + } ACTOR Future<Void> _setup( Database cx, AtomicOpsWorkload* self ) { + // Sanity check that the log keyspace has no leftover elements + state ReadYourWritesTransaction tr1(cx); + loop { + try { + Key begin(std::string("log")); + Standalone<RangeResultRef> log = + wait(tr1.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + if (!log.empty()) { + TraceEvent(SevError, "AtomicOpSetup") + .detail("LogKeySpace", "Not empty") + .detail("Result", log.toString()); + for (auto& kv : log) { + TraceEvent(SevWarn, "AtomicOpSetup") + .detail("K", kv.key.toString()) + .detail("V", kv.value.toString()); + } + } + break; + } catch (Error& e) { + wait(tr1.onError(e)); + } + } + state int g = 0; for(; g < 100; g++) { state ReadYourWritesTransaction tr(cx); @@ -148,30 +181,174 @@ struct AtomicOpsWorkload : TestWorkload { loop { try { int group = deterministicRandom()->randomInt(0,100); - uint64_t intValue = deterministicRandom()->randomInt( 0, 10000000 ); + state uint64_t intValue = deterministicRandom()->randomInt(0, 10000000); Key val = StringRef((const uint8_t*) &intValue, sizeof(intValue)); - tr.set(self->logKey(group), val); + state std::pair<Key, Key> logDebugKey = self->logDebugKey(group); int nodeIndex = deterministicRandom()->randomInt(0, self->nodeCount / 100); - tr.atomicOp(StringRef(format("ops%08x%08x", group, nodeIndex)), val, self->opType); - // TraceEvent(SevDebug, "AtomicOpWorker") - // .detail("LogKey", self->logKey(group)) - // .detail("Value", val) - // .detail("ValueInt", intValue); - // TraceEvent(SevDebug, "AtomicOpWorker") - // .detail("OpKey", format("ops%08x%08x", group, nodeIndex)) - // .detail("Value", val) - // .detail("ValueInt", intValue) - // .detail("AtomicOp", self->opType); + state Key opsKey(format("ops%08x%08x", group, nodeIndex)); + tr.set(logDebugKey.first, val); // set log key + tr.set(logDebugKey.second, opsKey); // set debug 
key; one opsKey can map to multiple log keys + tr.atomicOp(opsKey, val, self->opType); wait( tr.commit() ); + if (self->opType == MutationRef::AddValue) { + self->lbsum += intValue; + self->ubsum += intValue; + } break; } catch( Error &e ) { - wait( tr.onError(e) ); - // self->opNum--; + if (e.code() == 1021 /* commit_unknown_result */) { + self->ubsum += intValue; // the commit may or may not have been applied, so only the upper bound grows + TraceEvent(SevWarnAlways, "TxnCommitUnknownResult") + .detail("Value", intValue) + .detail("LogKey", logDebugKey.first) + .detail("OpsKey", opsKey); + } + wait(tr.onError(e)); } } } } + ACTOR Future<Void> dumpLogKV(Database cx, int g) { + try { + state ReadYourWritesTransaction tr(cx); + Key begin(format("log%08x", g)); + Standalone<RangeResultRef> log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + if (log.more) { + TraceEvent(SevError, "LogHitTxnLimits").detail("Result", log.toString()); + } + uint64_t sum = 0; + for (auto& kv : log) { + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + sum += intValue; + TraceEvent("AtomicOpLog") + .detail("Key", kv.key) + .detail("Val", kv.value) + .detail("IntValue", intValue) + .detail("CurSum", sum); + } + } catch( Error &e ) { + TraceEvent("DumpLogKVError").detail("Error", e.what()); + wait( tr.onError(e) ); + } + return Void(); + } + + ACTOR Future<Void> dumpDebugKV(Database cx, int g) { + try { + state ReadYourWritesTransaction tr(cx); + Key begin(format("debug%08x", g)); + Standalone<RangeResultRef> debuglog = + wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + if (debuglog.more) { + TraceEvent(SevError, "DebugLogHitTxnLimits").detail("Result", debuglog.toString()); + } + for (auto& kv : debuglog) { + TraceEvent("AtomicOpDebug").detail("Key", kv.key).detail("Val", kv.value); + } + } catch( Error &e ) { + TraceEvent("DumpDebugKVError").detail("Error", e.what()); + wait( tr.onError(e) ); + } + return Void(); + } + + ACTOR Future<Void> dumpOpsKV(Database cx, int g) { + try { + state ReadYourWritesTransaction tr(cx); + Key begin(format("ops%08x", g)); + Standalone<RangeResultRef> ops = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + if (ops.more) { + TraceEvent(SevError, "OpsHitTxnLimits").detail("Result", ops.toString()); + } + uint64_t sum = 0; + for (auto& kv : ops) { + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + sum += intValue; + TraceEvent("AtomicOpOps") + .detail("Key", kv.key) + .detail("Val", kv.value) + .detail("IntVal", intValue) + .detail("CurSum", sum); + } + } catch( Error &e ) { + TraceEvent("DumpOpsKVError").detail("Error", e.what()); + wait( tr.onError(e) ); + } + return Void(); + } + + ACTOR Future<Void> validateOpsKey(Database cx, AtomicOpsWorkload* self, int g) { + // Get the mapping between opsKeys and debugKeys + state ReadYourWritesTransaction tr1(cx); + state std::map<Key, Key> records; // <opsKey, debugKey> + Key debugBegin(format("debug%08x", g)); + Standalone<RangeResultRef> debuglog = + wait(tr1.getRange(KeyRangeRef(debugBegin, strinc(debugBegin)), CLIENT_KNOBS->TOO_MANY)); + if (debuglog.more) { + TraceEvent(SevError, "DebugLogHitTxnLimits").detail("Result", debuglog.toString()); + return Void(); + } + for (auto& kv : debuglog) { + records[kv.value] = kv.key; + } + + // Get each log key's value and assign it to the associated debugKey + state ReadYourWritesTransaction tr2(cx); + state std::map<Key, uint64_t> logVal; // <debugKey, log value> + Key logBegin(format("log%08x", g)); + Standalone<RangeResultRef> log = wait(tr2.getRange(KeyRangeRef(logBegin, strinc(logBegin)), CLIENT_KNOBS->TOO_MANY)); + if (log.more) { + TraceEvent(SevError, "LogHitTxnLimits").detail("Result", log.toString()); + return Void();
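[Editor's note] The lbsum/ubsum bookkeeping in atomicOpWorker above exists because a commit that fails with commit_unknown_result (error 1021) may or may not have been durably applied, so the expected total becomes an interval rather than a single number. A minimal, standalone sketch of that accounting follows; it is illustrative only, not the FDB API:

#include <cstdint>
#include <cstdio>

// Track a lower and an upper bound on the expected sum of AddValue ops.
struct BoundedSum {
    uint64_t lbsum = 0; // definitely applied
    uint64_t ubsum = 0; // possibly applied

    void onCommitSucceeded(uint64_t v) { lbsum += v; ubsum += v; }
    void onCommitUnknown(uint64_t v) { ubsum += v; } // may or may not have landed
    // On a plain (retryable) failure neither bound moves; the txn retries.

    bool explains(uint64_t observed) const { return lbsum <= observed && observed <= ubsum; }
};

int main() {
    BoundedSum s;
    s.onCommitSucceeded(10);
    s.onCommitUnknown(5); // e.g. error 1021: commit_unknown_result
    // Any observed sum in [10, 15] is consistent with this history.
    printf("lb=%llu ub=%llu ok=%d\n", (unsigned long long)s.lbsum,
           (unsigned long long)s.ubsum, (int)s.explains(15));
}

The LogAddMismatch trace in _check below logs both bounds so that an apparent mismatch can be judged against this interval.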
+ } + for (auto& kv : log) { + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + logVal[kv.key.removePrefix(LiteralStringRef("log")).withPrefix(LiteralStringRef("debug"))] = intValue; + } + + // Get opsKeys and validate that each has the correct value + state ReadYourWritesTransaction tr3(cx); + state std::map<Key, uint64_t> opsVal; // <ops key, ops value> + Key opsBegin(format("ops%08x", g)); + Standalone<RangeResultRef> ops = wait(tr3.getRange(KeyRangeRef(opsBegin, strinc(opsBegin)), CLIENT_KNOBS->TOO_MANY)); + if (ops.more) { + TraceEvent(SevError, "OpsHitTxnLimits").detail("Result", ops.toString()); + return Void(); + } + // Validate that each ops key's value is consistent with its log keys' values + for (auto& kv : ops) { + bool inRecord = records.find(kv.key) != records.end(); + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + opsVal[kv.key] = intValue; + if (!inRecord) { + TraceEvent(SevError, "MissingLogKey").detail("OpsKey", kv.key); + } + if (inRecord && (self->actorCount == 1 && intValue != logVal[records[kv.key]])) { + // When multiple actors exist, one opsKey can have multiple log keys + TraceEvent(SevError, "InconsistentOpsKeyValue") + .detail("OpsKey", kv.key) + .detail("DebugKey", records[kv.key]) + .detail("LogValue", logVal[records[kv.key]]) + .detail("OpValue", intValue); + } + } + + // Validate that no ops key is missing + for (auto& kv : records) { + if (opsVal.find(kv.first) == opsVal.end()) { + TraceEvent(SevError, "MissingOpsKey2") + .detail("OpsKey", kv.first) + .detail("DebugKey", kv.second); + } + } + return Void(); + } + ACTOR Future<bool> _check( Database cx, AtomicOpsWorkload* self ) { state int g = 0; state bool ret = true; @@ -228,7 +405,17 @@ struct AtomicOpsWorkload : TestWorkload { logResult += intValue; } if(logResult != opsResult) { - TraceEvent(SevError, "LogAddMismatch").detail("LogResult", logResult).detail("OpResult", opsResult).detail("OpsResultStr", printable(opsResultStr)).detail("Size", opsResultStr.size()); + TraceEvent(SevError, "LogAddMismatch") + .detail("LogResult", logResult) + .detail("OpResult", opsResult) + .detail("OpsResultStr", printable(opsResultStr)) + .detail("Size", opsResultStr.size()) + .detail("LowerBoundSum", self->lbsum) + .detail("UpperBoundSum", self->ubsum); + wait(self->dumpLogKV(cx, g)); + wait(self->dumpDebugKV(cx, g)); + wait(self->dumpOpsKV(cx, g)); + wait(self->validateOpsKey(cx, self, g)); } } break; diff --git a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp index 8266883298..389764353e 100644 --- a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp @@ -23,7 +23,7 @@ #include "fdbclient/BackupContainer.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. // A workload which tests the correctness of the backup and restore process @@ -119,7 +119,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { return; } - printf("[CheckDB] KV Number. Prev DB:%d Current DB:%d\n", self->dbKVs.size(), newDbKVs.size()); + printf("[CheckDB] KV Number. 
Prev DB:%ld Current DB:%ld\n", self->dbKVs.size(), newDbKVs.size()); // compare the KV pairs in the DB printf("------------------Now print out the diff between the prev DB and current DB-------------------\n"); if (self->dbKVs.size() >= newDbKVs.size()) { @@ -251,23 +251,19 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { state int retryCount = 0; loop { try { - tr.reset(); - state Version v = wait(tr.getReadVersion()); state Standalone data = wait( tr.getRange(firstGreaterOrEqual(doubleToTestKey(0.0, keyPrefix)), firstGreaterOrEqual(doubleToTestKey(1.0, keyPrefix)), std::numeric_limits::max())); printf("dump DB, at %s. retryCount:%d Data size:%d, rangeResultInfo:%s\n", when.c_str(), retryCount, data.size(), data.contents().toString().c_str()); dumpDBKVs(data, self); - break; + return Void(); } catch (Error& e) { retryCount++; TraceEvent(retryCount > 20 ? SevWarnAlways : SevWarn, "dumpDBError").error(e); wait(tr.onError(e)); } } - - return Void(); } virtual std::string description() { return "BackupAndParallelRestoreCorrectness"; } @@ -755,15 +751,6 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { state int64_t taskCount = wait(backupAgent.getTaskCount(tr)); state int waitCycles = 0; - if ((taskCount) && (0)) { - TraceEvent("BARW_EndingNonzeroTaskCount", randomID) - .detail("BackupTag", printable(self->backupTag)) - .detail("TaskCount", taskCount) - .detail("WaitCycles", waitCycles); - printf("EndingNonZeroTasks: %ld\n", (long)taskCount); - wait(TaskBucket::debugPrintRange(cx, LiteralStringRef("\xff"), StringRef())); - } - loop { waitCycles++; diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index eec11a54d4..1dcaf853f7 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -267,7 +267,6 @@ struct ConfigureDatabaseWorkload : TestWorkload { ACTOR Future singleDB( ConfigureDatabaseWorkload *self, Database cx ) { state Transaction tr; - state int i; loop { if(g_simulator.speedUpSimulation) { return Void(); diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 9fd9245971..b9bef9f5c6 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -19,6 +19,8 @@ */ #include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/CoordinationInterface.h" +#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" @@ -38,7 +40,7 @@ static std::set const& normalAttritionErrors() { ACTOR Future ignoreSSFailuresForDuration(Database cx, double duration) { // duration doesn't matter since this won't timeout TraceEvent("IgnoreSSFailureStart"); - bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString, 0)); + wait(success(setHealthyZone(cx, ignoreSSFailuresZoneString, 0))); TraceEvent("IgnoreSSFailureWait"); wait(delay(duration)); TraceEvent("IgnoreSSFailureClear"); @@ -59,10 +61,13 @@ ACTOR Future ignoreSSFailuresForDuration(Database cx, double duration) { struct MachineAttritionWorkload : TestWorkload { bool enabled; int machinesToKill, machinesToLeave; - double testDuration; + double testDuration, suspendDuration; bool reboot; bool killDc; + bool killMachine; + bool killDatahall; bool killSelf; + Standalone targetId; bool replacement; bool waitForVersion; bool allowFaultInjection; @@ -78,9 +83,13 
@@ struct MachineAttritionWorkload : TestWorkload { machinesToKill = getOption( options, LiteralStringRef("machinesToKill"), 2 ); machinesToLeave = getOption( options, LiteralStringRef("machinesToLeave"), 1 ); testDuration = getOption( options, LiteralStringRef("testDuration"), 10.0 ); + suspendDuration = getOption( options, LiteralStringRef("suspendDuration"), 1.0 ); reboot = getOption( options, LiteralStringRef("reboot"), false ); killDc = getOption( options, LiteralStringRef("killDc"), deterministicRandom()->random01() < 0.25 ); + killMachine = getOption( options, LiteralStringRef("killMachine"), false); + killDatahall = getOption( options, LiteralStringRef("killDatahall"), false); killSelf = getOption( options, LiteralStringRef("killSelf"), false ); + targetId = getOption( options, LiteralStringRef("targetId"), LiteralStringRef("")); replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 ); waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false ); allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true ); @@ -124,6 +133,12 @@ struct MachineAttritionWorkload : TestWorkload { reportErrorsExcept( machineKillWorker( this, meanDelay, cx ), "machineKillWorkerError", UID(), &normalAttritionErrors()), testDuration, Void() ); } + if (!clientId && !g_network->isSimulated()) { + double meanDelay = testDuration / machinesToKill; + return timeout( + reportErrorsExcept(noSimMachineKillWorker(this, meanDelay, cx), "noSimMachineKillWorkerError", UID(), &normalAttritionErrors()), + testDuration, Void()); + } if(killSelf) throw please_reboot(); return Void(); @@ -132,17 +147,114 @@ struct MachineAttritionWorkload : TestWorkload { virtual void getMetrics( vector& m ) { } - struct UIDPredicate { - UIDPredicate(StringRef uid ) : uid( uid ) {} - bool operator() ( WorkerInterface rhs ) { return rhs.locality.zoneId() != uid; } - private: - StringRef uid; - }; + static bool noSimIsViableKill(WorkerDetails worker) { + if (worker.processClass == ProcessClass::ClassType::TesterClass) return false; + return true; + } + + ACTOR static Future noSimMachineKillWorker(MachineAttritionWorkload *self, double meanDelay, Database cx) { + ASSERT(!g_network->isSimulated()); + state int killedMachines = 0; + state double delayBeforeKill = deterministicRandom()->random01() * meanDelay; + state std::vector allWorkers = + wait(self->dbInfo->get().clusterInterface.getWorkers.getReply(GetWorkersRequest())); + // Can reuse reboot request to send to each interface since no reply promise needed + state RebootRequest rbReq; + if (self->reboot) { + rbReq.waitForDuration = self->suspendDuration; + } else { + rbReq.waitForDuration = std::numeric_limits::max(); + } + state std::vector workers; + // Pre-processing step: remove all testers from list of workers + for (const auto& worker : allWorkers) { + if (noSimIsViableKill(worker)) { + workers.push_back(worker); + } + } + deterministicRandom()->randomShuffle(workers); + if (self->killDc) { + wait(delay(delayBeforeKill)); + // Pick a dcId to kill + Optional> killDcId = self->targetId.toString().empty() ? 
workers.back().interf.locality.dcId() : self->targetId; + TraceEvent("Assassination").detail("TargetDataCenterId", killDcId); + for (const auto& worker : workers) { + // kill all matching dcId workers + if (worker.interf.locality.dcId().present() && worker.interf.locality.dcId() == killDcId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); + worker.interf.clientInterface.reboot.send(rbReq); + } + } + } else if (self->killMachine) { + wait(delay(delayBeforeKill)); + // Pick a machine to kill + Optional> killMachineId = self->targetId.toString().empty() ? workers.back().interf.locality.machineId() : self->targetId; + TraceEvent("Assassination").detail("TargetMachineId", killMachineId); + for (const auto& worker : workers) { + // kill all matching machine workers + if (worker.interf.locality.machineId().present() && worker.interf.locality.machineId() == killMachineId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); + worker.interf.clientInterface.reboot.send(rbReq); + } + } + } else if (self->killDatahall) { + wait(delay(delayBeforeKill)); + // Pick a datahall to kill + Optional> killDatahallId = self->targetId.toString().empty() ? workers.back().interf.locality.dataHallId() : self->targetId; + TraceEvent("Assassination").detail("TargetDatahallId", killDatahallId); + for (const auto& worker : workers) { + // kill all matching datahall workers + if (worker.interf.locality.dataHallId().present() && worker.interf.locality.dataHallId() == killDatahallId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); + worker.interf.clientInterface.reboot.send(rbReq); + } + } + } else { + while (killedMachines < self->machinesToKill && workers.size() > self->machinesToLeave) { + TraceEvent("WorkerKillBegin") + .detail("KilledMachines", killedMachines) + .detail("MachinesToKill", self->machinesToKill) + .detail("MachinesToLeave", self->machinesToLeave) + .detail("Machines", workers.size()); + wait(delay(delayBeforeKill)); + TraceEvent("WorkerKillAfterDelay").detail("Delay", delayBeforeKill); + if (self->waitForVersion) { + state Transaction tr(cx); + loop { + try { + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + wait(success(tr.getReadVersion())); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + // Pick a machine to kill + state WorkerDetails targetMachine; + targetMachine = workers.back(); + TraceEvent("Assassination") + .detail("TargetMachine", targetMachine.interf.locality.toString()) + .detail("ZoneId", targetMachine.interf.locality.zoneId()) + .detail("KilledMachines", killedMachines) + .detail("MachinesToKill", self->machinesToKill) + .detail("MachinesToLeave", self->machinesToLeave) + .detail("Machines", workers.size()); + targetMachine.interf.clientInterface.reboot.send(rbReq); + killedMachines++; + workers.pop_back(); + wait(delay(meanDelay - delayBeforeKill)); + delayBeforeKill = deterministicRandom()->random01() * meanDelay; + TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill); + } + } + return Void(); + } ACTOR static Future machineKillWorker( MachineAttritionWorkload *self, double meanDelay, Database cx ) { state int killedMachines = 0; state double delayBeforeKill = deterministicRandom()->random01() * meanDelay; - state std::set killedUIDs; ASSERT( g_network->isSimulated() ); @@ -194,9 +306,8 @@ struct MachineAttritionWorkload : 
TestWorkload { state LocalityData targetMachine = self->machines.back(); if(BUGGIFY_WITH_PROB(0.01)) { TEST(true); //Marked a zone for maintenance before killing it - bool _ = - wait(setHealthyZone(cx, targetMachine.zoneId().get(), deterministicRandom()->random01() * 20)); - // } + wait(success( + setHealthyZone(cx, targetMachine.zoneId().get(), deterministicRandom()->random01() * 20))); } else if (BUGGIFY_WITH_PROB(0.005)) { TEST(true); // Disable DD for all storage server failures self->ignoreSSFailures = diff --git a/fdbserver/workloads/Mako.actor.cpp b/fdbserver/workloads/Mako.actor.cpp index c8482a5402..044ee49cbe 100644 --- a/fdbserver/workloads/Mako.actor.cpp +++ b/fdbserver/workloads/Mako.actor.cpp @@ -427,7 +427,7 @@ struct MakoWorkload : TestWorkload { ACTOR template static Future logLatency(Future f, ContinuousSample* opLatencies){ state double opBegin = now(); - T value = wait(f); + wait(success(f)); opLatencies->addSample(now() - opBegin); return Void(); } diff --git a/fdbserver/workloads/ParallelRestore.actor.cpp b/fdbserver/workloads/ParallelRestore.actor.cpp index d9f24c212c..c877048a43 100644 --- a/fdbserver/workloads/ParallelRestore.actor.cpp +++ b/fdbserver/workloads/ParallelRestore.actor.cpp @@ -23,7 +23,7 @@ #include "fdbclient/BackupContainer.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. // A workload which test the correctness of backup and restore process @@ -45,7 +45,7 @@ struct RunRestoreWorkerWorkload : TestWorkload { for (int i = 0; i < num_myWorkers; ++i) { myWorkers.push_back(_restoreWorker(cx, LocalityData())); } - printf("RunParallelRestoreWorkerWorkload, wait on reply from %d restore workers\n", myWorkers.size()); + printf("RunParallelRestoreWorkerWorkload, wait on reply from %ld restore workers\n", myWorkers.size()); worker = waitForAll(myWorkers); printf("RunParallelRestoreWorkerWorkload, got all replies from restore workers\n"); return Void(); diff --git a/fdbserver/workloads/RemoveServersSafely.actor.cpp b/fdbserver/workloads/RemoveServersSafely.actor.cpp index d44e4e5b08..1900ddeeaa 100644 --- a/fdbserver/workloads/RemoveServersSafely.actor.cpp +++ b/fdbserver/workloads/RemoveServersSafely.actor.cpp @@ -452,18 +452,36 @@ struct RemoveServersSafelyWorkload : TestWorkload { // Swap coordinator with one server in the kill set to ensure the number of processes to kill does not increase. 
// This is needed only if a new coordinator is added to the toKill set in this function and safety check passes if (markExcludeAsFailed && coordExcl.isValid()) { + // Situation where the entirety of original kill set is selected and extra coordinator is added + // Shrink down failed vector to maintain size guarantees + if (toKillMarkFailedArray.size() > toKillArray.size()) { + auto removeServer = toKillMarkFailedArray.begin(); + TraceEvent("RemoveAndKill", functionId) + .detail("Step", "ShrinkFailedKillSet") + .detail("Removing", removeServer->toString()); + toKillMarkFailedArray.erase(removeServer); + } + ASSERT(toKillMarkFailedArray.size() <= toKillArray.size()); auto removeServer = toKill.begin(); TraceEvent("RemoveAndKill", functionId) - .detail("Step", "ReplaceKillSet") + .detail("Step", "ReplaceNonFailedKillSet") .detail("Removing", removeServer->toString()) .detail("Adding", coordExcl.toString()); - toKill.erase(removeServer); - toKill.insert(coordExcl); toKillArray.erase(std::remove(toKillArray.begin(), toKillArray.end(), *removeServer), toKillArray.end()); toKillArray.push_back(coordExcl); + toKill.erase(removeServer); + toKill.insert(coordExcl); } killProcArray = self->getProcesses(toKill); - TraceEvent("RemoveAndKill", functionId).detail("Step", "Activate Server Exclusion").detail("KillAddrs", toKill.size()).detail("KillProcs", killProcArray.size()).detail("MissingProcs", toKill.size()!=killProcArray.size()).detail("ToKill", describe(toKill)).detail("Addresses", describe(toKillArray)).detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill", functionId) + .detail("Step", "Activate Server Exclusion") + .detail("KillAddrs", toKill.size()) + .detail("KillProcs", killProcArray.size()) + .detail("MissingProcs", toKill.size() != killProcArray.size()) + .detail("ToKill", describe(toKill)) + .detail("Addresses", describe(toKillArray)) + .detail("FailedAddresses", describe(toKillMarkFailedArray)) + .detail("ClusterAvailable", g_simulator.isAvailable()); if (markExcludeAsFailed) { wait( excludeServers( cx, toKillMarkFailedArray, true ) ); } diff --git a/fdbserver/workloads/SnapTest.actor.cpp b/fdbserver/workloads/SnapTest.actor.cpp index aaed65ce11..78cd7580ae 100644 --- a/fdbserver/workloads/SnapTest.actor.cpp +++ b/fdbserver/workloads/SnapTest.actor.cpp @@ -159,7 +159,6 @@ public: // workload functions keys.push_back(deterministicRandom()->randomInt64(0, INT64_MAX - 2)); } - state int retry = 0; tr.reset(); loop { try { @@ -190,6 +189,7 @@ public: // workload functions ACTOR Future _start(Database cx, SnapTestWorkload* self) { state Transaction tr(cx); + state bool snapFailed = false; if (self->testID == 0) { // create even keys before the snapshot @@ -202,7 +202,6 @@ public: // workload functions wait(delay(toDelay)); state int retry = 0; - state bool snapFailed = false; loop { self->snapUID = deterministicRandom()->randomUniqueID(); try { diff --git a/flow/Arena.h b/flow/Arena.h index 3af189c8b4..4d8b5aa914 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -468,7 +468,7 @@ struct union_like_traits> : std::true_type { } template - static const void assign(Member& member, const U& t, Context&) { + static void assign(Member& member, const U& t, Context&) { member = t; } }; diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 84184156c3..ace8930c72 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -63,7 +63,7 @@ set(FLOW_SRCS XmlTraceLogFormatter.cpp actorcompiler.h error_definitions.h - ${CMAKE_CURRENT_BINARY_DIR}/hgVersion.h + 
${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h flat_buffers.h flat_buffers.cpp flow.cpp @@ -78,7 +78,7 @@ set(FLOW_SRCS stacktrace.h version.cpp) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/hgVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/hgVersion.h) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h) add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS}) target_include_directories(flow PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) @@ -92,7 +92,8 @@ target_link_libraries(flow PUBLIC boost_target Threads::Threads ${CMAKE_DL_LIBS} if(USE_VALGRIND) target_link_libraries(flow PUBLIC Valgrind) endif() -if(NOT WITH_TLS) +# TODO(atn34) Re-enable TLS for OPEN_FOR_IDE build once #2201 is resolved +if(NOT WITH_TLS OR OPEN_FOR_IDE) target_compile_definitions(flow PUBLIC TLS_DISABLED) else() target_link_libraries(flow PUBLIC FDBLibTLS) diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index 196a20b4ef..94a5c9d0e8 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -212,6 +212,10 @@ public: if (size <= 128) return FastAllocator<128>::allocate(); if (size <= 256) return FastAllocator<256>::allocate(); if (size <= 512) return FastAllocator<512>::allocate(); + if (size <= 1024) return FastAllocator<1024>::allocate(); + if (size <= 2048) return FastAllocator<2048>::allocate(); + if (size <= 4096) return FastAllocator<4096>::allocate(); + if (size <= 8192) return FastAllocator<8192>::allocate(); return new uint8_t[size]; } @@ -223,6 +227,10 @@ static void freeFast(int size, void* ptr) { if (size <= 128) return FastAllocator<128>::release(ptr); if (size <= 256) return FastAllocator<256>::release(ptr); if (size <= 512) return FastAllocator<512>::release(ptr); + if (size <= 1024) return FastAllocator<1024>::release(ptr); + if (size <= 2048) return FastAllocator<2048>::release(ptr); + if (size <= 4096) return FastAllocator<4096>::release(ptr); + if (size <= 8192) return FastAllocator<8192>::release(ptr); delete[](uint8_t*)ptr; } diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index b2a58fdd04..62d722ba83 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -27,6 +27,7 @@ FlowKnobs const* FLOW_KNOBS = new FlowKnobs(); #define init( knob, value ) initKnob( knob, value, #knob ) +// clang-format off FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( AUTOMATIC_TRACE_DUMP, 1 ); init( PREVENT_FAST_SPIN_DELAY, .01 ); @@ -87,6 +88,10 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( CACHE_EVICTION_POLICY, "random" ); init( PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION, 0.1 ); if( randomize && BUGGIFY ) PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION = 0.0; else if( randomize && BUGGIFY ) PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION = 1.0; + //AsyncFileEIO + init( EIO_MAX_PARALLELISM, 4 ); + init( EIO_USE_ODIRECT, 0 ); + //AsyncFileKAIO init( MAX_OUTSTANDING, 64 ); init( MIN_SUBMIT, 10 ); @@ -136,7 +141,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( ZERO_LENGTH_FILE_PAD, 1 ); init( TRACE_FLUSH_INTERVAL, 0.25 ); init( TRACE_RETRY_OPEN_INTERVAL, 1.00 ); - init( MIN_TRACE_SEVERITY, isSimulated ? 0 : 10 ); // Related to the trace severity in Trace.h + init( MIN_TRACE_SEVERITY, isSimulated ? 
1 : 10 ); // Related to the trace severity in Trace.h init( MAX_TRACE_SUPPRESSIONS, 1e4 ); init( TRACE_SYNC_ENABLED, 0 ); init( TRACE_EVENT_METRIC_UNITS_PER_SAMPLE, 500 ); @@ -179,6 +184,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( LOAD_BALANCE_MAX_BAD_OPTIONS, 1 ); //should be the same as MAX_MACHINES_FALLING_BEHIND init( LOAD_BALANCE_PENALTY_IS_BAD, true ); } +// clang-format on static std::string toLower( std::string const& name ) { std::string lower_name; diff --git a/flow/Knobs.h b/flow/Knobs.h index 0f45fb5599..ab9ab917b6 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -107,6 +107,10 @@ public: double TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY; int TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT; + //AsyncFileEIO + int EIO_MAX_PARALLELISM; + int EIO_USE_ODIRECT; + //AsyncFileKAIO int MAX_OUTSTANDING; int MIN_SUBMIT; diff --git a/flow/ObjectSerializerTraits.h b/flow/ObjectSerializerTraits.h index 2f560f441c..dc3dd8c9ae 100644 --- a/flow/ObjectSerializerTraits.h +++ b/flow/ObjectSerializerTraits.h @@ -133,7 +133,7 @@ struct union_like_traits : std::false_type { static const index_t& get(const Member&, Context&); template - static const void assign(Member&, const Alternative&, Context&); + static void assign(Member&, const Alternative&, Context&); template static void done(Member&, Context&); @@ -150,7 +150,7 @@ struct struct_like_traits : std::false_type { static const index_t& get(const Member&, Context&); template - static const void assign(Member&, const index_t&, Context&); + static void assign(Member&, const index_t&, Context&); template static void done(Member&, Context&); @@ -175,7 +175,7 @@ struct union_like_traits> : std::true_type { } template - static const void assign(Member& member, const Alternative& a, Context&) { + static void assign(Member& member, const Alternative& a, Context&) { static_assert(std::is_same_v, Alternative>); member = a; } diff --git a/flow/Platform.h b/flow/Platform.h index fc5427c63d..b20cb9e346 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -524,6 +524,7 @@ inline static void aligned_free(void* ptr) { free(ptr); } inline static void* aligned_alloc(size_t alignment, size_t size) { return memalign(alignment, size); } #endif #elif defined(__APPLE__) +#if !defined(HAS_ALIGNED_ALLOC) #include inline static void* aligned_alloc(size_t alignment, size_t size) { // Linux's aligned_alloc() requires alignment to be a power of 2. 
While posix_memalign() @@ -540,6 +541,7 @@ inline static void* aligned_alloc(size_t alignment, size_t size) { posix_memalign(&ptr, alignment, size); return ptr; } +#endif inline static void aligned_free(void* ptr) { free(ptr); } #endif diff --git a/flow/SourceVersion.h.cmake b/flow/SourceVersion.h.cmake new file mode 100644 index 0000000000..d4b4a390ab --- /dev/null +++ b/flow/SourceVersion.h.cmake @@ -0,0 +1,2 @@ +#pragma once +#define sourceVersion "${CURRENT_GIT_VERSION}" diff --git a/flow/TDMetric.actor.h b/flow/TDMetric.actor.h index 32eb8ceaae..73205b2481 100755 --- a/flow/TDMetric.actor.h +++ b/flow/TDMetric.actor.h @@ -1350,10 +1350,11 @@ typedef ContinuousMetric> StringMetric; // template struct MetricHandle { - template - MetricHandle(StringRef const &name = StringRef(), StringRef const &id = StringRef(), ValueType const &initial = ValueType()) - : ref(T::getOrCreateInstance(name, id, true, initial)) { - } + using ValueType = typename T::ValueType; + + MetricHandle(StringRef const& name = StringRef(), StringRef const& id = StringRef(), + ValueType const& initial = ValueType()) + : ref(T::getOrCreateInstance(name, id, true, initial)) {} // Initialize this handle to point to a new or existing metric with (name, id). If a new metric is created then the handle's // current metric's current value will be the new metric's initial value. This allows Metric handle users to treate their diff --git a/flow/Trace.h b/flow/Trace.h index 12d2bb3ade..ff9d6a9673 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -45,14 +45,15 @@ inline static bool TRACE_SAMPLE() { return false; } extern thread_local int g_trace_depth; enum Severity { - SevSample=1, - SevDebug=5, - SevInfo=10, - SevWarn=20, - SevWarnAlways=30, - SevError=40, - SevMaxUsed=SevError, - SevMax=1000000 + SevVerbose = 0, + SevSample = 1, + SevDebug = 5, + SevInfo = 10, + SevWarn = 20, + SevWarnAlways = 30, + SevError = 40, + SevMaxUsed = SevError, + SevMax = 1000000 }; class TraceEventFields { diff --git a/flow/flat_buffers.h b/flow/flat_buffers.h index 4794773a85..33e1cbedc9 100644 --- a/flow/flat_buffers.h +++ b/flow/flat_buffers.h @@ -73,7 +73,7 @@ struct struct_like_traits> : std::true_type { } template - static const void assign(Member& m, const Type& t, Context&) { + static void assign(Member& m, const Type& t, Context&) { std::get(m) = t; } }; diff --git a/flow/flow.h b/flow/flow.h index ecf25397d8..67e8bf6706 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -225,7 +225,7 @@ struct union_like_traits> : std::true_type { } template - static const void assign(Member& m, const Alternative& a, Context&) { + static void assign(Member& m, const Alternative& a, Context&) { if constexpr (i == 0) { m = a; } else { diff --git a/flow/flow.vcxproj b/flow/flow.vcxproj index fc0fa2a412..1adada93f4 100644 --- a/flow/flow.vcxproj +++ b/flow/flow.vcxproj @@ -142,8 +142,8 @@ - echo const char *hgVersion = "Current version id not currently supported within Windows."; > hgVersion.temp.h && fc /b hgVersion.temp.h hgVersion.h > nul || copy hgVersion.temp.h hgVersion.h > nul - Checking HG source version + echo const char *sourceVersion = "Current version id not currently supported within Windows."; > SourceVersion.temp.h && fc /b SourceVersion.temp.h SourceVersion.h > nul || copy SourceVersion.temp.h SourceVersion.h > nul + Checking source version diff --git a/flow/hgVersion.h.cmake b/flow/hgVersion.h.cmake deleted file mode 100644 index 7083caa285..0000000000 --- a/flow/hgVersion.h.cmake +++ /dev/null @@ -1,2 +0,0 @@ -#pragma once -#define hgVersion 
"${CURRENT_GIT_VERSION}" diff --git a/flow/local.mk b/flow/local.mk index 6ff17bb62e..6c6d0d69bb 100644 --- a/flow/local.mk +++ b/flow/local.mk @@ -28,12 +28,12 @@ ifeq ($(PLATFORM),osx) flow_LDFLAGS += -framework CoreFoundation -framework IOKit endif -GENERATED_SOURCES += flow/hgVersion.h versions.h +flow_GENERATED_SOURCES += flow/SourceVersion.h versions.h -flow/hgVersion.h: FORCE - @echo "Checking hgVersion.h" - @echo "const char *hgVersion = \"$(VERSION_ID)\";" > flow/hgVersion.h.new - @([ -e flow/hgVersion.h ] && diff -q flow/hgVersion.h flow/hgVersion.h.new >/dev/null && rm flow/hgVersion.h.new) || mv flow/hgVersion.h.new flow/hgVersion.h +flow/SourceVersion.h: FORCE + @echo "Checking SourceVersion.h" + @echo "const char *sourceVersion = \"$(VERSION_ID)\";" > flow/SourceVersion.h.new + @([ -e flow/SourceVersion.h ] && diff -q flow/SourceVersion.h flow/SourceVersion.h.new >/dev/null && rm flow/SourceVersion.h.new) || mv flow/SourceVersion.h.new flow/SourceVersion.h lib/libflow.a: bin/coverage.flow.xml diff --git a/flow/network.h b/flow/network.h index 9b5edc57f3..e479f2a597 100644 --- a/flow/network.h +++ b/flow/network.h @@ -75,6 +75,7 @@ enum class TaskPriority { DataDistribution = 3500, DiskWrite = 3010, UpdateStorage = 3000, + CompactCache = 2900, TLogSpilledPeekReply = 2800, FetchKeys = 2500, Low = 2000, diff --git a/flow/version.cpp b/flow/version.cpp index 61e1a6d2ef..2b2ffe8f68 100644 --- a/flow/version.cpp +++ b/flow/version.cpp @@ -18,8 +18,8 @@ * limitations under the License. */ -#include "flow/hgVersion.h" +#include "flow/SourceVersion.h" -const char* getHGVersion() { - return hgVersion; +const char* getSourceVersion() { + return sourceVersion; } diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index dd1524d0c7..01bc76c575 100644 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@