Merge remote-tracking branch 'origin/main' into expose-txn-cost

sfc-gh-tclinkenbeard 2022-10-23 12:59:07 -07:00
commit 32ae7bb529
177 changed files with 4367 additions and 1345 deletions

View File

@ -274,85 +274,21 @@ if(NOT WIN32)
@CLUSTER_FILE@
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
)
add_fdbclient_test(
NAME fdb_c_api_tests
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_local_only
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/local_tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_blob_granule
DISABLE_LOG_DUMP
API_TEST_BLOB_GRANULES_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests
--blob-granule-local-file-path
@DATA_DIR@/fdbblob/
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_with_tls
DISABLE_LOG_DUMP
TLS_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--tls-cert-file
@CLIENT_CERT_FILE@
--tls-key-file
@CLIENT_KEY_FILE@
--tls-ca-file
@SERVER_CA_FILE@
)
file(GLOB API_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/test/apitester/tests/*.toml")
foreach(test_file ${API_TEST_FILES})
get_filename_component(file_name "${test_file}" NAME_WE)
set(test_name "fdb_c_api_test_${file_name}")
add_test(NAME "${test_name}"
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--api-tester-bin $<TARGET_FILE:fdb_c_api_tester>
--external-client-library ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-file ${test_file}
--knob delete-native-lib-after-loading=false
)
set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300)
endforeach()
add_test(NAME fdb_c_upgrade_to_future_version
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py

View File

@ -585,6 +585,58 @@ extern "C" DLLEXPORT FDBFuture* fdb_tenant_wait_purge_granules_complete(FDBTenan
.extractPtr());
}
extern "C" DLLEXPORT FDBFuture* fdb_tenant_blobbify_range(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length) {
return (FDBFuture*)(TENANT(tenant)
->blobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
StringRef(end_key_name, end_key_name_length)))
.extractPtr());
}
extern "C" DLLEXPORT FDBFuture* fdb_tenant_unblobbify_range(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length) {
return (FDBFuture*)(TENANT(tenant)
->unblobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
StringRef(end_key_name, end_key_name_length)))
.extractPtr());
}
extern "C" DLLEXPORT FDBFuture* fdb_tenant_list_blobbified_ranges(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int rangeLimit) {
return (FDBFuture*)(TENANT(tenant)
->listBlobbifiedRanges(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
StringRef(end_key_name, end_key_name_length)),
rangeLimit)
.extractPtr());
}
extern "C" DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_tenant_verify_blob_range(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t version) {
Optional<Version> rv;
if (version != latestVersion) {
rv = version;
}
return (FDBFuture*)(TENANT(tenant)
->verifyBlobRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
StringRef(end_key_name, end_key_name_length)),
rv)
.extractPtr());
}
extern "C" DLLEXPORT void fdb_tenant_destroy(FDBTenant* tenant) {
try {
TENANT(tenant)->delref();

View File

@ -376,6 +376,39 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_tenant_wait_purge_granules_complete(
uint8_t const* purge_key_name,
int purge_key_name_length);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_tenant_blobbify_range(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_tenant_unblobbify_range(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_tenant_list_blobbified_ranges(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int rangeLimit);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_tenant_verify_blob_range(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t version);
DLLEXPORT void fdb_tenant_destroy(FDBTenant* tenant);
DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr);
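As a usage illustration for the declarations above, a minimal sketch in C++ against the C API. It assumes the network was set up and its thread is already running, that `tenant` was opened elsewhere, and that the boolean future is read via `fdb_future_get_bool`; the API version is illustrative.
```
// Sketch: synchronously blobbify a tenant key range and report success.
// Assumes fdb_select_api_version()/fdb_setup_network() were called and the
// network thread is running; `tenant` is an already-opened FDBTenant*.
#define FDB_API_VERSION 720 // illustrative; use the version this header ships with
#include <foundationdb/fdb_c.h>
#include <cstdio>

bool blobbify_range_sync(FDBTenant* tenant,
                         const uint8_t* begin, int begin_len,
                         const uint8_t* end, int end_len) {
	FDBFuture* f = fdb_tenant_blobbify_range(tenant, begin, begin_len, end, end_len);
	fdb_bool_t success = 0;
	fdb_error_t err = fdb_future_block_until_ready(f); // block this thread until ready
	if (!err)
		err = fdb_future_get_bool(f, &success); // read the boolean result
	fdb_future_destroy(f);
	if (err) {
		std::fprintf(stderr, "blobbifyRange failed: %s\n", fdb_get_error(err));
		return false;
	}
	return success != 0;
}
```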

View File

@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -279,9 +279,9 @@ bool parseArgs(TesterOptions& options, int argc, char** argv) {
return true;
}
void fdb_check(fdb::Error e) {
if (e) {
fmt::print(stderr, "Unexpected FDB error: {}({})\n", e.code(), e.what());
void fdb_check(fdb::Error e, std::string_view msg, fdb::Error::CodeType expectedError = error_code_success) {
if (e.code()) {
fmt::print(stderr, "{}, Error: {}({})\n", msg, e.code(), e.what());
std::abort();
}
}
@ -453,13 +453,13 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
if (!runWorkloads(options)) {
retCode = 1;
}
fdb_check(fdb::network::stop());
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join();
} catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());

View File

@ -1,29 +0,0 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -29,31 +29,39 @@ from pathlib import Path
import glob
import random
import string
import toml
sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "tests", "TestRunner")]
# fmt: off
from tmp_cluster import TempCluster
from local_cluster import TLSConfig
# fmt: on
TESTER_STATS_INTERVAL_SEC = 5
def random_string(len):
return ''.join(random.choice(string.ascii_letters + string.digits) for i in range(len))
return "".join(random.choice(string.ascii_letters + string.digits) for i in range(len))
def get_logger():
return logging.getLogger('foundationdb.run_c_api_tests')
return logging.getLogger("foundationdb.run_c_api_tests")
def initialize_logger_level(logging_level):
logger = get_logger()
assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']
assert logging_level in ["DEBUG", "INFO", "WARNING", "ERROR"]
logging.basicConfig(format='%(message)s')
if logging_level == 'DEBUG':
logging.basicConfig(format="%(message)s")
if logging_level == "DEBUG":
logger.setLevel(logging.DEBUG)
elif logging_level == 'INFO':
elif logging_level == "INFO":
logger.setLevel(logging.INFO)
elif logging_level == 'WARNING':
elif logging_level == "WARNING":
logger.setLevel(logging.WARNING)
elif logging_level == 'ERROR':
elif logging_level == "ERROR":
logger.setLevel(logging.ERROR)
@ -65,35 +73,52 @@ def dump_client_logs(log_dir):
print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file))
def run_tester(args, test_file):
cmd = [args.tester_binary,
"--cluster-file", args.cluster_file,
"--test-file", test_file,
"--stats-interval", str(TESTER_STATS_INTERVAL_SEC*1000)]
def run_tester(args, cluster, test_file):
build_dir = Path(args.build_dir).resolve()
tester_binary = Path(args.api_tester_bin).resolve()
external_client_library = build_dir.joinpath("bindings", "c", "libfdb_c_external.so")
log_dir = Path(cluster.log).joinpath("client")
log_dir.mkdir(exist_ok=True)
cmd = [
tester_binary,
"--cluster-file",
cluster.cluster_file,
"--test-file",
test_file,
"--stats-interval",
str(TESTER_STATS_INTERVAL_SEC * 1000),
"--tmp-dir",
cluster.tmp_dir,
"--log",
"--log-dir",
str(log_dir),
]
if args.external_client_library is not None:
cmd += ["--external-client-library", args.external_client_library]
if args.tmp_dir is not None:
cmd += ["--tmp-dir", args.tmp_dir]
log_dir = None
if args.log_dir is not None:
log_dir = Path(args.log_dir).joinpath(random_string(8))
log_dir.mkdir(exist_ok=True)
cmd += ['--log', "--log-dir", str(log_dir)]
external_client_library = Path(args.external_client_library).resolve()
cmd += ["--external-client-library", external_client_library]
if args.blob_granule_local_file_path is not None:
cmd += ["--blob-granule-local-file-path",
args.blob_granule_local_file_path]
if cluster.blob_granules_enabled:
cmd += [
"--blob-granule-local-file-path",
str(cluster.data.joinpath("fdbblob")) + os.sep,
]
if args.tls_ca_file is not None:
cmd += ["--tls-ca-file", args.tls_ca_file]
if cluster.tls_config is not None:
cmd += [
"--tls-ca-file",
cluster.server_ca_file,
"--tls-key-file",
cluster.client_key_file,
"--tls-cert-file",
cluster.client_cert_file,
]
if args.tls_key_file is not None:
cmd += ["--tls-key-file", args.tls_key_file]
for knob in args.knobs:
knob_name, knob_value = knob.split("=")
cmd += ["--knob-" + knob_name, knob_value]
if args.tls_cert_file is not None:
cmd += ["--tls-cert-file", args.tls_cert_file]
get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd))
get_logger().info("\nRunning tester '%s'..." % " ".join(map(str, cmd)))
proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
timed_out = False
ret_code = 1
@ -103,34 +128,76 @@ def run_tester(args, test_file):
proc.kill()
timed_out = True
except Exception as e:
raise Exception('Unable to run tester (%s)' % e)
raise Exception("Unable to run tester (%s)" % e)
if ret_code != 0:
if timed_out:
reason = 'timed out after %d seconds' % args.timeout
reason = "timed out after %d seconds" % args.timeout
elif ret_code < 0:
reason = signal.Signals(-ret_code).name
else:
reason = 'exit code: %d' % ret_code
get_logger().error('\n\'%s\' did not complete successfully (%s)' %
(cmd[0], reason))
if (log_dir is not None):
reason = "exit code: %d" % ret_code
get_logger().error("\n'%s' did not complete succesfully (%s)" % (cmd[0], reason))
if log_dir is not None:
dump_client_logs(log_dir)
get_logger().info('')
get_logger().info("")
return ret_code
class TestConfig:
def __init__(self, test_file):
config = toml.load(test_file)
server_config = config.get("server", [{}])[0]
self.tenants_enabled = server_config.get("tenants_enabled", True)
self.blob_granules_enabled = server_config.get("blob_granules_enabled", False)
self.tls_enabled = server_config.get("tls_enabled", False)
self.client_chain_len = server_config.get("tls_client_chain_len", 2)
self.server_chain_len = server_config.get("tls_server_chain_len", 3)
self.min_num_processes = server_config.get("min_num_processes", 1)
self.max_num_processes = server_config.get("max_num_processes", 3)
self.num_processes = random.randint(self.min_num_processes, self.max_num_processes)
def run_test(args, test_file):
config = TestConfig(test_file)
tls_config = None
if config.tls_enabled:
tls_config = TLSConfig(
server_chain_len=config.server_chain_len,
client_chain_len=config.client_chain_len,
)
with TempCluster(
args.build_dir,
config.num_processes,
enable_tenants=config.tenants_enabled,
blob_granules_enabled=config.blob_granules_enabled,
tls_config=tls_config,
) as cluster:
ret_code = run_tester(args, cluster, test_file)
if not cluster.check_cluster_logs():
ret_code = 1 if ret_code == 0 else ret_code
return ret_code
def run_tests(args):
num_failed = 0
test_files = [f for f in os.listdir(args.test_dir) if os.path.isfile(
os.path.join(args.test_dir, f)) and f.endswith(".toml")]
if args.test_file is not None:
test_files = [Path(args.test_file).resolve()]
else:
test_files = [
f
for f in os.listdir(args.test_dir)
if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml")
]
for test_file in test_files:
get_logger().info('=========================================================')
get_logger().info('Running test %s' % test_file)
get_logger().info('=========================================================')
ret_code = run_tester(args, os.path.join(args.test_dir, test_file))
get_logger().info("=========================================================")
get_logger().info("Running test %s" % test_file)
get_logger().info("=========================================================")
ret_code = run_test(args, os.path.join(args.test_dir, test_file))
if ret_code != 0:
num_failed += 1
@ -138,32 +205,49 @@ def run_tests(args):
def parse_args(argv):
parser = argparse.ArgumentParser(description='FoundationDB C API Tester')
parser.add_argument('--cluster-file', type=str, default="fdb.cluster",
help='The cluster file for the cluster being connected to. (default: fdb.cluster)')
parser.add_argument('--tester-binary', type=str, default="fdb_c_api_tester",
help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)')
parser.add_argument('--external-client-library', type=str, default=None,
help='Path to the external client library. (default: None)')
parser.add_argument('--test-dir', type=str, default="./",
help='Path to a directory with test definitions. (default: ./)')
parser.add_argument('--timeout', type=int, default=300,
help='The timeout in seconds for running each individual test. (default 300)')
parser.add_argument('--log-dir', type=str, default=None,
help='The directory for storing logs (default: None)')
parser.add_argument('--logging-level', type=str, default='INFO',
choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').')
parser.add_argument('--tmp-dir', type=str, default=None,
help='The directory for storing temporary files (default: None)')
parser.add_argument('--blob-granule-local-file-path', type=str, default=None,
help='Enable blob granule tests if set, value is path to local blob granule files')
parser.add_argument('--tls-ca-file', type=str, default=None,
help='Path to client\'s TLS CA file: i.e. certificate of CA that signed the server certificate')
parser.add_argument('--tls-cert-file', type=str, default=None,
help='Path to client\'s TLS certificate file')
parser.add_argument('--tls-key-file', type=str, default=None,
help='Path to client\'s TLS private key file')
parser = argparse.ArgumentParser(description="FoundationDB C API Tester")
parser.add_argument("--build-dir", "-b", type=str, required=True, help="FDB build directory")
parser.add_argument("--api-tester-bin", type=str, help="Path to the fdb_c_api_tester executable.", required=True)
parser.add_argument("--external-client-library", type=str, help="Path to the external client library.")
parser.add_argument(
"--cluster-file",
type=str,
default="fdb.cluster",
help="The cluster file for the cluster being connected to. (default: fdb.cluster)",
)
parser.add_argument(
"--test-dir",
type=str,
default="./",
help="Path to a directory with test definitions. (default: ./)",
)
parser.add_argument(
"--test-file",
type=str,
default=None,
help="Path to a single test definition to be executed, overrides --test-dir if set.",
)
parser.add_argument(
"--timeout",
type=int,
default=300,
help="The timeout in seconds for running each individual test. (default 300)",
)
parser.add_argument(
"--logging-level",
type=str,
default="INFO",
choices=["ERROR", "WARNING", "INFO", "DEBUG"],
help="Specifies the level of detail in the tester output (default='INFO').",
)
parser.add_argument(
"--knob",
type=str,
default=[],
action="append",
dest="knobs",
help="[lowercase-knob-name]=[knob-value] (there may be multiple --knob options)",
)
return parser.parse_args(argv)
@ -174,5 +258,5 @@ def main(argv):
return run_tests(args)
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))

View File

@ -12,13 +12,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,13 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -11,13 +11,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,28 @@
[[test]]
title = 'Cancel Transaction with Database per Transaction with TLS'
multiThreaded = true
buggify = true
databasePerTransaction = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -11,15 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -0,0 +1,29 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -11,23 +11,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -4,23 +4,23 @@ minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -0,0 +1,37 @@
[[test]]
title = 'API Correctness with TLS'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -11,23 +11,22 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000

View File

@ -9,13 +9,13 @@ maxClients = 8
minTenants = 2
maxTenants = 5
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,25 @@
[[test]]
title = 'Multi-tenant API Correctness Multi Threaded'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minClients = 2
maxClients = 8
minTenants = 2
maxTenants = 5
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9

View File

@ -12,13 +12,13 @@ maxClientThreads = 4
minClients = 2
maxClients = 4
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,28 @@
[[test]]
title = 'Test tampering the cluster file with TLS'
multiThreaded = true
buggify = true
tamperClusterFile = true
minFdbThreads = 2
maxFdbThreads = 4
minDatabases = 2
maxDatabases = 4
minClientThreads = 2
maxClientThreads = 4
minClients = 2
maxClients = 4
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -46,7 +46,7 @@ int main(int argc, char** argv) {
}
fdb_check(fdb_select_api_version(FDB_API_VERSION));
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
fdb_check(
fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));

View File

@ -321,7 +321,16 @@ int populate(Database db,
const auto key_begin = insertBegin(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
const auto key_end = insertEnd(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
auto key_checkpoint = key_begin; // in case of commit failure, restart from this key
double required_keys = (key_end - key_begin + 1) * args.load_factor;
for (auto i = key_begin; i <= key_end; i++) {
// Choose required_keys out of the (key_end - i + 1) remaining keys at random, so each key is kept
// with probability required_keys / (key_end - i + 1). Generate a random number in [0, 1); if it is
// less than or equal to that probability, choose this key.
double r = rand() / (1.0 + RAND_MAX);
if (r > required_keys / (key_end - i + 1)) {
continue;
}
--required_keys;
/* sequential keys */
genKey(keystr.data(), KEY_PREFIX, args, i);
/* random values */
@ -984,6 +993,7 @@ int initArguments(Arguments& args) {
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
@ -1166,6 +1176,7 @@ void usage() {
printf("%-24s %s\n", "-t, --threads=THREADS", "Specify number of worker threads");
printf("%-24s %s\n", " --async_xacts", "Specify number of concurrent transactions to be run in async mode");
printf("%-24s %s\n", "-r, --rows=ROWS", "Specify number of records");
printf("%-24s %s\n", "-l, --load_factor=LOAD_FACTOR", "Specify load factor");
printf("%-24s %s\n", "-s, --seconds=SECONDS", "Specify the test duration in seconds\n");
printf("%-24s %s\n", "", "This option cannot be specified with --iteration.");
printf("%-24s %s\n", "-i, --iteration=ITERS", "Specify the number of iterations.\n");
@ -1228,6 +1239,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "threads", required_argument, NULL, 't' },
{ "async_xacts", required_argument, NULL, ARG_ASYNC },
{ "rows", required_argument, NULL, 'r' },
{ "load_factor", required_argument, NULL, 'l' },
{ "seconds", required_argument, NULL, 's' },
{ "iteration", required_argument, NULL, 'i' },
{ "keylen", required_argument, NULL, ARG_KEYLEN },
@ -1304,6 +1316,9 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.rows = atoi(optarg);
args.row_digits = digits(args.rows);
break;
case 'l':
args.load_factor = atof(optarg);
break;
case 's':
args.seconds = atoi(optarg);
break;
@ -1523,6 +1538,10 @@ int validateArguments(Arguments const& args) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
@ -2118,6 +2137,7 @@ int statsProcessMain(Arguments const& args,
fmt::fprintf(fp, "\"async_xacts\": %d,", args.async_xacts);
fmt::fprintf(fp, "\"mode\": %d,", args.mode);
fmt::fprintf(fp, "\"rows\": %d,", args.rows);
fmt::fprintf(fp, "\"load_factor\": %lf,", args.load_factor);
fmt::fprintf(fp, "\"seconds\": %d,", args.seconds);
fmt::fprintf(fp, "\"iteration\": %d,", args.iteration);
fmt::fprintf(fp, "\"tpsmax\": %d,", args.tpsmax);

View File

@ -138,6 +138,7 @@ struct Arguments {
int async_xacts;
int mode;
int rows; /* is 2 billion enough? */
double load_factor;
int row_digits;
int seconds;
int iteration;

View File

@ -233,7 +233,7 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
// Try calling some basic functionality that is available
// in all recent API versions

View File

@ -271,7 +271,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
timeoutDb = fdb_open_database(argv[1]);

View File

@ -66,7 +66,7 @@ TEST_CASE("setup") {
},
&context));
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
CHECK(!context.called);
fdb_check(fdb_stop_network());

View File

@ -68,7 +68,7 @@ int main(int argc, char** argv) {
set_net_opt(FDBNetworkOption::FDB_NET_OPTION_TRACE_PARTIAL_FILE_SUFFIX, trace_partial_file_suffix);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
// Apparently you need to open a database to initialize logging
FDBDatabase* out;

View File

@ -2998,7 +2998,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
clusterFilePath = std::string(argv[1]);

View File

@ -88,7 +88,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
{
FDBCluster* cluster;

View File

@ -24,6 +24,7 @@
#include "com_apple_foundationdb_FDB.h"
#include "com_apple_foundationdb_FDBDatabase.h"
#include "com_apple_foundationdb_FDBTenant.h"
#include "com_apple_foundationdb_FDBTransaction.h"
#include "com_apple_foundationdb_FutureBool.h"
#include "com_apple_foundationdb_FutureInt64.h"
@ -1102,6 +1103,203 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBTenant_Tenant_1dispose(JNI
fdb_tenant_destroy((FDBTenant*)tPtr);
}
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTenant_Tenant_1purgeBlobGranules(JNIEnv* jenv,
jobject,
jlong tPtr,
jbyteArray beginKeyBytes,
jbyteArray endKeyBytes,
jlong purgeVersion,
jboolean force) {
if (!tPtr || !beginKeyBytes || !endKeyBytes) {
throwParamNotNull(jenv);
return 0;
}
FDBTenant* tenant = (FDBTenant*)tPtr;
uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
if (!beginKeyArr) {
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
if (!endKeyArr) {
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
FDBFuture* f = fdb_tenant_purge_blob_granules(tenant,
beginKeyArr,
jenv->GetArrayLength(beginKeyBytes),
endKeyArr,
jenv->GetArrayLength(endKeyBytes),
purgeVersion,
(fdb_bool_t)force);
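// JNI_ABORT releases the pinned arrays without copying data back: the key bytes were only read.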
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT);
return (jlong)f;
}
JNIEXPORT jlong JNICALL
Java_com_apple_foundationdb_FDBTenant_Tenant_1waitPurgeGranulesComplete(JNIEnv* jenv,
jobject,
jlong tPtr,
jbyteArray purgeKeyBytes) {
if (!tPtr || !purgeKeyBytes) {
throwParamNotNull(jenv);
return 0;
}
FDBTenant* tenant = (FDBTenant*)tPtr;
uint8_t* purgeKeyArr = (uint8_t*)jenv->GetByteArrayElements(purgeKeyBytes, JNI_NULL);
if (!purgeKeyArr) {
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
FDBFuture* f = fdb_tenant_wait_purge_granules_complete(tenant, purgeKeyArr, jenv->GetArrayLength(purgeKeyBytes));
jenv->ReleaseByteArrayElements(purgeKeyBytes, (jbyte*)purgeKeyArr, JNI_ABORT);
return (jlong)f;
}
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTenant_Tenant_1blobbifyRange(JNIEnv* jenv,
jobject,
jlong tPtr,
jbyteArray beginKeyBytes,
jbyteArray endKeyBytes) {
if (!tPtr || !beginKeyBytes || !endKeyBytes) {
throwParamNotNull(jenv);
return 0;
}
FDBTenant* tenant = (FDBTenant*)tPtr;
uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
if (!beginKeyArr) {
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
if (!endKeyArr) {
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
FDBFuture* f = fdb_tenant_blobbify_range(
tenant, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes));
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT);
return (jlong)f;
}
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTenant_Tenant_1unblobbifyRange(JNIEnv* jenv,
jobject,
jlong tPtr,
jbyteArray beginKeyBytes,
jbyteArray endKeyBytes) {
if (!tPtr || !beginKeyBytes || !endKeyBytes) {
throwParamNotNull(jenv);
return 0;
}
FDBTenant* tenant = (FDBTenant*)tPtr;
uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
if (!beginKeyArr) {
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
if (!endKeyArr) {
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
FDBFuture* f = fdb_tenant_unblobbify_range(
tenant, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes));
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT);
return (jlong)f;
}
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTenant_Tenant_1listBlobbifiedRanges(JNIEnv* jenv,
jobject,
jlong tPtr,
jbyteArray beginKeyBytes,
jbyteArray endKeyBytes,
jint rangeLimit) {
if (!tPtr || !beginKeyBytes || !endKeyBytes) {
throwParamNotNull(jenv);
return 0;
}
FDBTenant* tenant = (FDBTenant*)tPtr;
uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
if (!startKey) {
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
if (!endKey) {
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
FDBFuture* f = fdb_tenant_list_blobbified_ranges(
tenant, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rangeLimit);
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT);
return (jlong)f;
}
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTenant_Tenant_1verifyBlobRange(JNIEnv* jenv,
jobject,
jlong tPtr,
jbyteArray beginKeyBytes,
jbyteArray endKeyBytes,
jlong version) {
if (!tPtr || !beginKeyBytes || !endKeyBytes) {
throwParamNotNull(jenv);
return 0;
}
FDBTenant* tenant = (FDBTenant*)tPtr;
uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
if (!startKey) {
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
if (!endKey) {
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
FDBFuture* f = fdb_tenant_verify_blob_range(
tenant, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), version);
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT);
return (jlong)f;
}
JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1setVersion(JNIEnv* jenv,
jobject,
jlong tPtr,

View File

@ -138,6 +138,66 @@ class FDBTenant extends NativeObjectWrapper implements Tenant {
}
}
@Override
public CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e) {
pointerReadLock.lock();
try {
return new FutureKey(Tenant_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force), e, eventKeeper);
} finally {
pointerReadLock.unlock();
}
}
@Override
public CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey, Executor e) {
pointerReadLock.lock();
try {
return new FutureVoid(Tenant_waitPurgeGranulesComplete(getPtr(), purgeKey), e);
} finally {
pointerReadLock.unlock();
}
}
@Override
public CompletableFuture<Boolean> blobbifyRange(byte[] beginKey, byte[] endKey, Executor e) {
pointerReadLock.lock();
try {
return new FutureBool(Tenant_blobbifyRange(getPtr(), beginKey, endKey), e);
} finally {
pointerReadLock.unlock();
}
}
@Override
public CompletableFuture<Boolean> unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e) {
pointerReadLock.lock();
try {
return new FutureBool(Tenant_unblobbifyRange(getPtr(), beginKey, endKey), e);
} finally {
pointerReadLock.unlock();
}
}
@Override
public CompletableFuture<KeyRangeArrayResult> listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e) {
pointerReadLock.lock();
try {
return new FutureKeyRangeArray(Tenant_listBlobbifiedRanges(getPtr(), beginKey, endKey, rangeLimit), e);
} finally {
pointerReadLock.unlock();
}
}
@Override
public CompletableFuture<Long> verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e) {
pointerReadLock.lock();
try {
return new FutureInt64(Tenant_verifyBlobRange(getPtr(), beginKey, endKey, version), e);
} finally {
pointerReadLock.unlock();
}
}
@Override
public byte[] getName() {
return name;
@ -155,4 +215,10 @@ class FDBTenant extends NativeObjectWrapper implements Tenant {
private native long Tenant_createTransaction(long cPtr);
private native void Tenant_dispose(long cPtr);
private native long Tenant_purgeBlobGranules(long cPtr, byte[] beginKey, byte[] endKey, long purgeVersion, boolean force);
private native long Tenant_waitPurgeGranulesComplete(long cPtr, byte[] purgeKey);
private native long Tenant_blobbifyRange(long cPtr, byte[] beginKey, byte[] endKey);
private native long Tenant_unblobbifyRange(long cPtr, byte[] beginKey, byte[] endKey);
private native long Tenant_listBlobbifiedRanges(long cPtr, byte[] beginKey, byte[] endKey, int rangeLimit);
private native long Tenant_verifyBlobRange(long cPtr, byte[] beginKey, byte[] endKey, long version);
}

View File

@ -247,6 +247,173 @@ public interface Tenant extends AutoCloseable, TransactionContext {
<T> CompletableFuture<T> runAsync(
Function<? super Transaction, ? extends CompletableFuture<T>> retryable, Executor e);
/**
* Runs {@link #purgeBlobGranules(byte[], byte[], long, boolean, Executor)} on the default executor.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param force if true, delete all data; otherwise keep data >= purgeVersion
*
* @return the key to watch for purge completion
*/
default CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, boolean force) {
return purgeBlobGranules(beginKey, endKey, -2, force, getExecutor());
}
/**
* Runs {@link #purgeBlobGranules(byte[], byte[], long, boolean, Executor)} on the default executor.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param purgeVersion version to purge at
* @param force if true, delete all data; otherwise keep data >= purgeVersion
*
* @return the key to watch for purge completion
*/
default CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force) {
return purgeBlobGranules(beginKey, endKey, purgeVersion, force, getExecutor());
}
/**
* Queues a purge of blob granules for the specified key range of this tenant, at the specified version.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param purgeVersion version to purge at
* @param force if true, delete all data; otherwise keep data >= purgeVersion
* @param e the {@link Executor} to use for asynchronous callbacks
* @return the key to watch for purge completion
*/
CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e);
/**
* Runs {@link #waitPurgeGranulesComplete(byte[], Executor)} on the default executor.
*
* @param purgeKey key to watch
*/
default CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey) {
return waitPurgeGranulesComplete(purgeKey, getExecutor());
}
/**
* Wait for a previous call to purgeBlobGranules to complete.
*
* @param purgeKey key to watch
* @param e the {@link Executor} to use for asynchronous callbacks
*/
CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey, Executor e);
/**
* Runs {@link #blobbifyRange(byte[], byte[], Executor)} on the default executor.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @return whether the range was successfully recorded
*/
default CompletableFuture<Boolean> blobbifyRange(byte[] beginKey, byte[] endKey) {
return blobbifyRange(beginKey, endKey, getExecutor());
}
/**
* Sets a range to be blobbified in this tenant. Must be a completely unblobbified range.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param e the {@link Executor} to use for asynchronous callbacks
* @return whether the range was successfully recorded
*/
CompletableFuture<Boolean> blobbifyRange(byte[] beginKey, byte[] endKey, Executor e);
/**
* Runs {@link #unblobbifyRange(byte[], byte[], Executor)} on the default executor.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @return whether the range was successfully recorded
*/
default CompletableFuture<Boolean> unblobbifyRange(byte[] beginKey, byte[] endKey) {
return unblobbifyRange(beginKey, endKey, getExecutor());
}
/**
* Unsets a blobbified range in this tenant. The range must be aligned to known blob ranges.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param e the {@link Executor} to use for asynchronous callbacks
* @return whether the range was successfully recorded
*/
CompletableFuture<Boolean> unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e);
/**
* Runs {@link #listBlobbifiedRanges(byte[], byte[], int, Executor)} on the default executor.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param rangeLimit batch size
* @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)]
*/
default CompletableFuture<KeyRangeArrayResult> listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit) {
return listBlobbifiedRanges(beginKey, endKey, rangeLimit, getExecutor());
}
/**
* Lists blobbified ranges in this tenant. There may be more if result.size() == rangeLimit.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param rangeLimit batch size
* @param e the {@link Executor} to use for asynchronous callbacks
* @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)]
*/
CompletableFuture<KeyRangeArrayResult> listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e);
/**
* Runs {@link #verifyBlobRange(byte[], byte[], long, Executor)} on the default executor.
*
* @param beginKey start of the key range
* @param endKey end of the key range
*
* @return a future with the version of the last blob granule.
*/
default CompletableFuture<Long> verifyBlobRange(byte[] beginKey, byte[] endKey) {
return verifyBlobRange(beginKey, endKey, -2, getExecutor());
}
/**
* Runs {@link #verifyBlobRange(byte[], byte[], long, Executor)} on the default executor.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param version version to read at
*
* @return a future with the version of the last blob granule.
*/
default CompletableFuture<Long> verifyBlobRange(byte[] beginKey, byte[] endKey, long version) {
return verifyBlobRange(beginKey, endKey, version, getExecutor());
}
/**
* Checks if a blob range is blobbified in this tenant.
*
* @param beginKey start of the key range
* @param endKey end of the key range
* @param version version to read at
* @param e the {@link Executor} to use for asynchronous callbacks
*
* @return a future with the version of the last blob granule.
*/
CompletableFuture<Long> verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e);
/**
* Close the {@code Tenant} object and release any associated resources. This must be called at
* least once after the {@code Tenant} object is no longer in use. This can be called multiple

View File

@ -320,11 +320,11 @@ function(create_long_running_correctness_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${package_files}
${out_dir}/joshua_test

View File

@ -74,7 +74,7 @@ function(compile_boost)
BUILD_IN_SOURCE ON
INSTALL_COMMAND ""
UPDATE_COMMAND ""
BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/boost/config.hpp"
BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/include/boost/config.hpp"
"${BOOST_INSTALL_DIR}/lib/libboost_context.a"
"${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a"
"${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a")

View File

@ -0,0 +1,9 @@
#!/bin/sh
# Simulation currently has memory leaks. We need to investigate before we can enable leak detection in joshua.
export ASAN_OPTIONS="detect_leaks=0"
OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}"
#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false
python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --long-running

View File

@ -0,0 +1,3 @@
#!/bin/bash -u
python3 -m test_harness.timeout --long-running

View File

@ -184,6 +184,8 @@ class Config:
self.reproduce_prefix: str | None = None
self.reproduce_prefix_args = {'type': str, 'required': False,
'help': 'When printing the results, prepend this string to the command'}
self.long_running: bool = False
self.long_running_args = {'action': 'store_true'}
self._env_names: Dict[str, str] = {}
self._config_map = self._build_map()
self._read_env()

View File

@ -42,9 +42,10 @@ class ToSummaryTree(xml.sax.handler.ContentHandler):
def _print_summary(summary: SummaryTree, commands: Set[str]):
cmd = []
is_valgrind_run = False
if config.reproduce_prefix is not None:
cmd.append(config.reproduce_prefix)
cmd.append('fdbserver')
cmd.append('bin/fdbserver')
if 'TestFile' in summary.attributes:
file_name = summary.attributes['TestFile']
role = 'test' if test_harness.run.is_no_sim(Path(file_name)) else 'simulation'
@ -63,11 +64,6 @@ def _print_summary(summary: SummaryTree, commands: Set[str]):
else:
cmd += ['b', '<ERROR>']
cmd += ['--crash', '--trace_format', config.trace_format]
key = ' '.join(cmd)
count = 1
while key in commands:
key = '{} # {}'.format(' '.join(cmd), count)
count += 1
# we want the command as the first attribute
attributes = {'Command': ' '.join(cmd)}
for k, v in summary.attributes.items():
@ -76,18 +72,6 @@ def _print_summary(summary: SummaryTree, commands: Set[str]):
else:
attributes[k] = v
summary.attributes = attributes
if config.details:
key = str(len(commands))
str_io = io.StringIO()
summary.dump(str_io, prefix=(' ' if config.pretty_print else ''))
if config.output_format == 'json':
sys.stdout.write('{}"Test{}": {}'.format(' ' if config.pretty_print else '',
key, str_io.getvalue()))
else:
sys.stdout.write(str_io.getvalue())
if config.pretty_print:
sys.stdout.write('\n' if config.output_format == 'xml' else ',\n')
return key
error_count = 0
warning_count = 0
small_summary = SummaryTree('Test')
@ -98,6 +82,8 @@ def _print_summary(summary: SummaryTree, commands: Set[str]):
for child in summary.children:
if 'Severity' in child.attributes and child.attributes['Severity'] == '40' and error_count < config.max_errors:
error_count += 1
if child.name == 'ValgrindError':
is_valgrind_run = True
errors.append(child)
if 'Severity' in child.attributes and child.attributes[
'Severity'] == '30' and warning_count < config.max_warnings:
@ -122,6 +108,26 @@ def _print_summary(summary: SummaryTree, commands: Set[str]):
small_summary.children.append(errors)
if len(warnings.children) > 0:
small_summary.children.append(warnings)
if is_valgrind_run:
idx = 0 if config.reproduce_prefix is None else 1
cmd.insert(idx, 'valgrind')
key = ' '.join(cmd)
count = 1
while key in commands:
key = '{} # {}'.format(' '.join(cmd), count)
count += 1
if config.details:
key = str(len(commands))
str_io = io.StringIO()
summary.dump(str_io, prefix=(' ' if config.pretty_print else ''))
if config.output_format == 'json':
sys.stdout.write('{}"Test{}": {}'.format(' ' if config.pretty_print else '',
key, str_io.getvalue()))
else:
sys.stdout.write(str_io.getvalue())
if config.pretty_print:
sys.stdout.write('\n' if config.output_format == 'xml' else ',\n')
return key
output = io.StringIO()
small_summary.dump(output, prefix=(' ' if config.pretty_print else ''))
if config.output_format == 'json':

View File

@ -303,6 +303,7 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@ -375,7 +376,7 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)

View File

@ -41,6 +41,13 @@ To set the quota through `fdbcli`, run:
fdbcli> quota set <tag> [reserved_throughput|total_throughput] <bytes_per_second>
```
To clear both the reserved and total throughput quotas for a tag, run:
```
fdbcli> quota clear <tag>
```
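For example, the following illustrative session (tag name and values are hypothetical) sets, reads, and clears a quota:
```
fdbcli> quota set web total_throughput 32768
fdbcli> quota get web total_throughput
32768
fdbcli> quota clear web
fdbcli> quota get web total_throughput
<empty>
```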
### Limit Calculation
The transaction budget that ratekeeper calculates and distributes to clients (via GRV proxies) for each tag is calculated based on several intermediate rate calculations, outlined in this section.

View File

@ -524,6 +524,12 @@ The ``start`` command will start a new restore on the specified (or default) tag
``--inconsistent-snapshot-only``
Ignore mutation log files during the restore to speed up the process. Because only range files are restored, this option gives an inconsistent snapshot in most cases and is not recommended.
``--user-data``
Restore only the user keyspace. This option should NOT be used alongside --system-metadata (below) and CANNOT be used alongside other specified key ranges.
``--system-metadata``
Restore only the relevant system keyspace. This option should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.
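For example, a full restore might be split into a system-metadata pass followed by a user-data pass; the backup URL and cluster file below are illustrative::
fdbrestore start -r file:///backups/mybackup --dest-cluster-file fdb.cluster --system-metadata -w
fdbrestore start -r file:///backups/mybackup --dest-cluster-file fdb.cluster --user-data -w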
.. program:: fdbrestore abort
``abort``

View File

@ -528,7 +528,8 @@
"duplicate_mutation_fetch_timeout",
"primary_dc_missing",
"fetch_primary_dc_timeout",
"fetch_storage_wiggler_stats_timeout"
"fetch_storage_wiggler_stats_timeout",
"fetch_consistency_scan_info_timeout"
]
},
"issues":[

View File

@ -47,6 +47,7 @@
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/S3BlobStore.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/json_spirit/json_spirit_writer_template.h"
#include "flow/Platform.h"
@ -155,6 +156,11 @@ enum {
OPT_RESTORE_CLUSTERFILE_ORIG,
OPT_RESTORE_BEGIN_VERSION,
OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY,
// The two restore options below allow callers of fdbrestore to divide a normal restore into one which restores just
// the system keyspace and another that restores just the user keyspace. This is unlike the backup command, where
// all keys (both system and user) are backed up together.
OPT_RESTORE_USER_DATA,
OPT_RESTORE_SYSTEM_DATA,
// Shared constants
OPT_CLUSTERFILE,
@ -696,6 +702,8 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = {
{ OPT_BACKUPKEYS, "--keys", SO_REQ_SEP },
{ OPT_WAITFORDONE, "-w", SO_NONE },
{ OPT_WAITFORDONE, "--waitfordone", SO_NONE },
{ OPT_RESTORE_USER_DATA, "--user-data", SO_NONE },
{ OPT_RESTORE_SYSTEM_DATA, "--system-metadata", SO_NONE },
{ OPT_RESTORE_VERSION, "--version", SO_REQ_SEP },
{ OPT_RESTORE_VERSION, "-v", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
@ -1187,6 +1195,13 @@ static void printRestoreUsage(bool devhelp) {
printf(" The cluster file for the original database from which the backup was created. The "
"original database\n");
printf(" is only needed to convert a --timestamp argument to a database version.\n");
printf(" --user-data\n"
" Restore only the user keyspace. This option should NOT be used alongside "
"--system-metadata (below) and CANNOT be used alongside other specified key ranges.\n");
printf(
" --system-metadata\n"
" Restore only the relevant system keyspace. This option "
"should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.\n");
if (devhelp) {
#ifdef _WIN32
@ -3367,6 +3382,8 @@ int main(int argc, char* argv[]) {
bool trace = false;
bool quietDisplay = false;
bool dryRun = false;
bool restoreSystemKeys = false;
bool restoreUserKeys = false;
// TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start
bool encryptionEnabled = true;
std::string traceDir = "";
@ -3691,6 +3708,14 @@ int main(int argc, char* argv[]) {
restoreVersion = ver;
break;
}
case OPT_RESTORE_USER_DATA: {
restoreUserKeys = true;
break;
}
case OPT_RESTORE_SYSTEM_DATA: {
restoreSystemKeys = true;
break;
}
case OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY: {
inconsistentSnapshotOnly.set(true);
break;
@ -3838,6 +3863,11 @@ int main(int argc, char* argv[]) {
}
}
if (restoreSystemKeys && restoreUserKeys) {
fprintf(stderr, "ERROR: Please only specify one of --user-data or --system-metadata, not both\n");
return FDB_EXIT_ERROR;
}
if (trace) {
if (!traceLogGroup.empty())
setNetworkOption(FDBNetworkOptions::TRACE_LOG_GROUP, StringRef(traceLogGroup));
@ -3938,10 +3968,30 @@ int main(int argc, char* argv[]) {
// The fastrestore tool does not yet support multiple ranges and is incompatible with tenants
// or other features that back up data in the system keys
if (backupKeys.empty() && programExe != ProgramExe::FASTRESTORE_TOOL) {
if (!restoreSystemKeys && !restoreUserKeys && backupKeys.empty() &&
programExe != ProgramExe::FASTRESTORE_TOOL) {
addDefaultBackupRanges(backupKeys);
}
if ((restoreSystemKeys || restoreUserKeys) && programExe == ProgramExe::FASTRESTORE_TOOL) {
fprintf(stderr, "ERROR: Options: --user-data and --system-metadata are not supported with fastrestore\n");
return FDB_EXIT_ERROR;
}
if ((restoreUserKeys || restoreSystemKeys) && !backupKeys.empty()) {
fprintf(stderr,
"ERROR: Cannot specify additional ranges when using --user-data or --system-metadata "
"options\n");
return FDB_EXIT_ERROR;
}
if (restoreUserKeys) {
backupKeys.push_back_deep(backupKeys.arena(), normalKeys);
} else if (restoreSystemKeys) {
for (const auto& r : getSystemBackupRanges()) {
backupKeys.push_back_deep(backupKeys.arena(), r);
}
}
switch (programExe) {
case ProgramExe::AGENT:
if (!initCluster())

View File

@ -56,7 +56,7 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(tag.withPrefix(tagQuotaPrefix));
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
if (!v.present()) {
fmt::print("<empty>\n");
@ -77,11 +77,10 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, double value) {
state Reference<ITransaction> tr = db->createTransaction();
state Key key = tag.withPrefix(tagQuotaPrefix);
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(key);
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
ThrottleApi::TagQuotaValue quota;
if (v.present()) {
@ -107,8 +106,22 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
}
}
ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr->clear(ThrottleApi::getTagQuotaKey(tag));
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
constexpr auto usage = "quota [get <tag> [reserved_throughput|total_throughput] | set <tag> "
"[reserved_throughput|total_throughput] <value>]";
"[reserved_throughput|total_throughput] <value> | clear <tag>]";
bool exitFailure() {
fmt::print(usage);
@ -121,30 +134,40 @@ namespace fdb_cli {
ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
state bool result = true;
if (tokens.size() != 5 && tokens.size() != 6) {
if (tokens.size() < 3 || tokens.size() > 5) {
return exitFailure();
} else {
auto tag = parseTag(tokens[2]);
auto limitType = parseLimitType(tokens[3]);
if (!tag.present() || !limitType.present()) {
auto const tag = parseTag(tokens[2]);
if (!tag.present()) {
return exitFailure();
}
if (tokens[1] == "get"_sr) {
if (tokens.size() != 4) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
if (!limitType.present()) {
return exitFailure();
}
wait(getQuota(db, tag.get(), limitType.get()));
return true;
} else if (tokens[1] == "set"_sr) {
if (tokens.size() != 5) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
auto const limitValue = parseLimitValue(tokens[4]);
if (!limitValue.present()) {
if (!limitType.present() || !limitValue.present()) {
return exitFailure();
}
wait(setQuota(db, tag.get(), limitType.get(), limitValue.get()));
return true;
} else if (tokens[1] == "clear"_sr) {
if (tokens.size() != 3) {
return exitFailure();
}
wait(clearQuota(db, tag.get()));
return true;
} else {
return exitFailure();
}

View File

@ -542,8 +542,8 @@ void initHelp() {
"Displays the current read version of the database or currently running transaction.");
helpMap["quota"] = CommandHelp("quota",
"quota [get <tag> [reserved_throughput|total_throughput] | set <tag> "
"[reserved_throughput|total_throughput] <value>]",
"Get or modify the throughput quota for the specified tag.");
"[reserved_throughput|total_throughput] <value> | clear <tag>]",
"Get, modify, or clear the throughput quota for the specified tag.");
helpMap["reset"] =
CommandHelp("reset",
"reset the current transaction",
@ -1480,6 +1480,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (isCommitDesc && tokens.size() == 1) {
// prompt for description and add to txn
state Optional<std::string> raw;
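// Suspend the long-delay warning while blocking on interactive input; it is re-armed once the description has been read.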
warn.cancel();
while (!raw.present() || raw.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty.\n");
@ -1490,6 +1491,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
std::string line = raw.get();
config_tr->set("\xff\xff/description"_sr, line);
}
warn =
checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
if (transtype == TransType::Db) {
wait(commitTransaction(tr));
} else {
@ -1821,6 +1824,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (!intrans) {
// prompt for description and add to txn
state Optional<std::string> raw_desc;
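// As above, suspend the long-delay warning during the interactive prompt.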
warn.cancel();
while (!raw_desc.present() || raw_desc.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty\n");
@ -1830,6 +1834,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
}
std::string line = raw_desc.get();
config_tr->set("\xff\xff/description"_sr, line);
warn = checkStatus(
timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
wait(commitTransaction(config_tr));
} else {
isCommitDesc = true;

View File

@ -103,6 +103,59 @@ def maintenance(logger):
output3 = run_fdbcli_command('maintenance')
assert output3 == no_maintenance_output
@enable_logging()
def quota(logger):
# Should be a noop
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '<empty>'
# Ignored update
command = 'quota set red total_throughput 49152'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
command = 'quota set green total_throughput 32768'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
command = 'quota set green reserved_throughput 16384'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '32768'
command = 'quota get green reserved_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '16384'
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '<empty>'
# Too few arguments, should log help message
command = 'quota get green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
@enable_logging()
def setclass(logger):
@ -1035,6 +1088,7 @@ if __name__ == '__main__':
integer_options()
tls_address_suffix()
knobmanagement()
quota()
else:
assert args.process_number > 1, "Process number should be positive"
coordinators()

View File

@ -1057,6 +1057,9 @@ ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beg
beginVersion <= delta.clearVersion.get();
if (delta.values.empty()) {
return ParsedDeltaBoundaryRef(delta.key, clearAfter);
} else if (readVersion >= delta.values.back().version && beginVersion <= delta.values.back().version) {
// For all but at most one delta file in a read, readVersion >= every version in the file; optimize this common case.
return ParsedDeltaBoundaryRef(delta.key, clearAfter, delta.values.back());
}
auto valueAtVersion = std::lower_bound(delta.values.begin(),
delta.values.end(),
@ -1324,7 +1327,8 @@ typedef std::priority_queue<MergeStreamNext, std::vector<MergeStreamNext>, Order
static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
const std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>>& streams,
const std::vector<bool> startClears) {
const std::vector<bool> startClears,
GranuleMaterializeStats& stats) {
ASSERT(streams.size() < std::numeric_limits<int16_t>::max());
ASSERT(startClears.size() == streams.size());
@ -1337,6 +1341,10 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
std::set<int16_t, std::greater<int16_t>> activeClears;
int16_t maxActiveClear = -1;
// trade off memory for cpu performance by assuming all inserts
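// reserving the sum of all stream sizes up front lets rows be appended with shallow push_back
// (the input arenas are kept alive via dependsOn), at the cost of over-reserving when deltas
// overwrite or clear existing keys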
RangeResult result;
int maxExpectedSize = 0;
// check if a given stream is actively clearing
bool clearActive[streams.size()];
for (int16_t i = 0; i < streams.size(); i++) {
@ -1354,10 +1362,16 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
item.streamIdx = i;
item.dataIdx = 0;
next.push(item);
maxExpectedSize += streams[i].size();
result.arena().dependsOn(streams[i].arena());
}
}
result.reserve(result.arena(), maxExpectedSize);
if (chunk.snapshotFile.present()) {
stats.snapshotRows += streams[0].size();
}
RangeResult result;
std::vector<MergeStreamNext> cur;
cur.reserve(streams.size());
while (!next.empty()) {
@ -1373,6 +1387,7 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
// un-set clears and find latest value for key (if present)
bool foundValue = false;
bool includesSnapshot = cur.back().streamIdx == 0 && chunk.snapshotFile.present();
for (auto& it : cur) {
auto& v = streams[it.streamIdx][it.dataIdx];
if (clearActive[it.streamIdx]) {
@ -1391,7 +1406,14 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
if (v.isSet() && maxActiveClear < it.streamIdx) {
KeyRef finalKey =
chunk.tenantPrefix.present() ? v.key.removePrefix(chunk.tenantPrefix.get()) : v.key;
result.push_back_deep(result.arena(), KeyValueRef(finalKey, v.value));
result.push_back(result.arena(), KeyValueRef(finalKey, v.value));
if (!includesSnapshot) {
stats.rowsInserted++;
} else if (it.streamIdx > 0) {
stats.rowsUpdated++;
}
} else if (includesSnapshot) {
stats.rowsCleared++;
}
}
}
@ -1413,6 +1435,36 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
}
}
// FIXME: if memory assumption was wrong and result is significantly smaller than total input size, could copy it
// with push_back_deep to a new result. This is rare though
stats.outputBytes += result.expectedSize();
return result;
}
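The k-way merge above is easier to see in isolation. The following is a minimal, self-contained sketch of the same idea: a priority queue merging key-sorted streams, where the newest stream wins a duplicated key. Names are illustrative, and clears, arenas, and stats are omitted:
```
#include <iostream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

struct Item {
	std::string key;
	std::string value;
	int streamIdx; // which input stream this came from
	size_t dataIdx; // position within that stream
};

// Min-heap by key; for equal keys, lower stream indices surface first,
// so the last entry popped for a key comes from the newest stream.
struct Cmp {
	bool operator()(const Item& a, const Item& b) const {
		if (a.key != b.key)
			return a.key > b.key;
		return a.streamIdx > b.streamIdx;
	}
};

std::vector<std::pair<std::string, std::string>> mergeStreams(
    const std::vector<std::vector<std::pair<std::string, std::string>>>& streams) {
	std::priority_queue<Item, std::vector<Item>, Cmp> next;
	for (int i = 0; i < (int)streams.size(); i++) {
		if (!streams[i].empty())
			next.push({ streams[i][0].first, streams[i][0].second, i, 0 });
	}
	std::vector<std::pair<std::string, std::string>> out;
	while (!next.empty()) {
		// Pop every entry for the current key; the last one popped has the
		// highest streamIdx, i.e. the newest write for that key.
		Item winner = next.top();
		std::string key = winner.key;
		while (!next.empty() && next.top().key == key) {
			winner = next.top();
			next.pop();
			size_t j = winner.dataIdx + 1;
			if (j < streams[winner.streamIdx].size()) {
				next.push({ streams[winner.streamIdx][j].first,
				            streams[winner.streamIdx][j].second,
				            winner.streamIdx,
				            j });
			}
		}
		out.emplace_back(key, winner.value);
	}
	return out;
}

int main() {
	// Stream 0 plays the snapshot; stream 1 is a newer delta overriding "b".
	auto merged = mergeStreams({ { { "a", "1" }, { "b", "2" } }, { { "b", "9" } } });
	for (auto& kv : merged)
		std::cout << kv.first << "=" << kv.second << "\n"; // prints a=1, b=9
}
```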
RangeResult materializeJustSnapshot(const BlobGranuleChunkRef& chunk,
Optional<StringRef> snapshotData,
const KeyRange& requestRange,
GranuleMaterializeStats& stats) {
stats.inputBytes += snapshotData.get().size();
Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows = loadSnapshotFile(
chunk.snapshotFile.get().filename, snapshotData.get(), requestRange, chunk.snapshotFile.get().cipherKeysCtx);
RangeResult result;
if (!snapshotRows.empty()) {
result.arena().dependsOn(snapshotRows.arena());
result.reserve(result.arena(), snapshotRows.size());
for (auto& it : snapshotRows) {
// TODO REMOVE validation
ASSERT(it.op == MutationRef::Type::SetValue);
KeyRef finalKey = chunk.tenantPrefix.present() ? it.key.removePrefix(chunk.tenantPrefix.get()) : it.key;
result.push_back(result.arena(), KeyValueRef(finalKey, it.value));
}
stats.outputBytes += result.expectedSize();
stats.snapshotRows += result.size();
}
return result;
}
@ -1421,7 +1473,8 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]) {
StringRef deltaFileData[],
GranuleMaterializeStats& stats) {
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
@ -1438,12 +1491,18 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
requestRange = keyRange;
}
// fast case for only-snapshot read
if (chunk.snapshotFile.present() && chunk.deltaFiles.empty() && chunk.newDeltas.empty()) {
return materializeJustSnapshot(chunk, snapshotData, requestRange, stats);
}
std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>> streams;
std::vector<bool> startClears;
// +1 for possible snapshot, +1 for possible memory deltas
streams.reserve(chunk.deltaFiles.size() + 2);
if (snapshotData.present()) {
stats.inputBytes += snapshotData.get().size();
ASSERT(chunk.snapshotFile.present());
Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows =
loadSnapshotFile(chunk.snapshotFile.get().filename,
@ -1461,6 +1520,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
fmt::print("Applying {} delta files\n", chunk.deltaFiles.size());
}
for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) {
stats.inputBytes += deltaFileData[deltaIdx].size();
bool startClear = false;
auto deltaRows = loadChunkedDeltaFile(chunk.deltaFiles[deltaIdx].filename,
deltaFileData[deltaIdx],
@ -1480,6 +1540,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size());
}
if (!chunk.newDeltas.empty()) {
stats.inputBytes += chunk.newDeltas.expectedSize();
// TODO REMOVE validation
ASSERT(beginVersion <= chunk.newDeltas.front().version);
ASSERT(readVersion >= chunk.newDeltas.back().version);
@ -1491,7 +1552,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
}
}
return mergeDeltaStreams(chunk, streams, startClears);
return mergeDeltaStreams(chunk, streams, startClears, stats);
}
struct GranuleLoadFreeHandle : NonCopyable, ReferenceCounted<GranuleLoadFreeHandle> {
@ -1560,8 +1621,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
GranuleLoadIds loadIds[files.size()];
int64_t inputBytes = 0;
int64_t outputBytes = 0;
try {
// Kick off first file reads if parallelism > 1
@ -1586,7 +1645,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
if (!snapshotData.get().begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
}
inputBytes += snapshotData.get().size();
}
// +1 to avoid UBSAN variable length array of size zero
@ -1599,16 +1657,11 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
if (!deltaData[i].begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
}
inputBytes += deltaData[i].size();
}
inputBytes += files[chunkIdx].newDeltas.expectedSize();
// materialize rows from chunk
chunkRows =
materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData);
outputBytes += chunkRows.expectedSize();
chunkRows = materializeBlobGranule(
files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData, stats);
results.arena().dependsOn(chunkRows.arena());
results.append(results.arena(), chunkRows.begin(), chunkRows.size());
@ -1616,8 +1669,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
// free once done by forcing FreeHandles to trigger
loadIds[chunkIdx].freeHandles.clear();
}
stats.inputBytes = inputBytes;
stats.outputBytes = outputBytes;
return ErrorOr<RangeResult>(results);
} catch (Error& e) {
return ErrorOr<RangeResult>(e);
@ -2303,6 +2354,7 @@ void checkDeltaRead(const KeyValueGen& kvGen,
// expected answer
std::map<KeyRef, ValueRef> expectedData;
Version lastFileEndVersion = 0;
GranuleMaterializeStats stats;
fmt::print("Delta Read [{0} - {1}) @ {2} - {3}\n",
range.begin.printable(),
@ -2322,7 +2374,7 @@ void checkDeltaRead(const KeyValueGen& kvGen,
chunk.includedVersion = readVersion;
chunk.snapshotVersion = invalidVersion;
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, {}, serialized);
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, {}, serialized, stats);
if (expectedData.size() != actualData.size()) {
fmt::print("Expected Data {0}:\n", expectedData.size());
@ -2430,6 +2482,7 @@ void checkGranuleRead(const KeyValueGen& kvGen,
}
Version lastFileEndVersion = 0;
applyDeltasByVersion(deltaData, range, beginVersion, readVersion, lastFileEndVersion, expectedData);
GranuleMaterializeStats stats;
// actual answer
Standalone<BlobGranuleChunkRef> chunk;
@ -2477,7 +2530,8 @@ void checkGranuleRead(const KeyValueGen& kvGen,
if (beginVersion == 0) {
snapshotPtr = serializedSnapshot;
}
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs);
RangeResult actualData =
materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs, stats);
if (expectedData.size() != actualData.size()) {
fmt::print("Expected Size {0} != Actual Size {1}\n", expectedData.size(), actualData.size());
@ -2663,6 +2717,14 @@ struct CommonPrefixStats {
int totalKeys = 0;
int minKeySize = 1000000000;
int maxKeySize = 0;
int64_t logicalBytes = 0;
int64_t totalLogicalBytes = 0;
int deltas = 0;
int deltasSet = 0;
int deltasClear = 0;
int deltasNoOp = 0;
int deltasClearAfter = 0;
void addKey(const KeyRef& k) {
if (len == -1) {
@ -2677,7 +2739,38 @@ struct CommonPrefixStats {
maxKeySize = std::max(maxKeySize, k.size());
}
void addKeyValue(const KeyRef& k, const ValueRef& v) {
addKey(k);
logicalBytes += k.size();
logicalBytes += v.size();
}
void addBoundary(const ParsedDeltaBoundaryRef& d) {
addKey(d.key);
deltas++;
if (d.isSet()) {
deltasSet++;
logicalBytes += d.value.size();
} else if (d.isClear()) {
deltasClear++;
} else {
ASSERT(d.isNoOp());
deltasNoOp++;
}
if (d.clearAfter) {
deltasClearAfter++;
}
}
void doneFile() {
totalLogicalBytes += logicalBytes;
fmt::print("Logical Size: {0}\n", logicalBytes);
logicalBytes = 0;
}
Key done() {
doneFile();
ASSERT(len >= 0);
fmt::print("Common prefix: {0}\nCommon Prefix Length: {1}\nAverage Key Size: {2}\nMin Key Size: {3}, Max Key "
"Size: {4}\n",
@ -2686,11 +2779,21 @@ struct CommonPrefixStats {
totalKeySize / totalKeys,
minKeySize,
maxKeySize);
if (deltas > 0) {
fmt::print("Delta stats: {0} deltas, {1} sets, {2} clears, {3} noops, {4} clearAfters\n",
deltas,
deltasSet,
deltasClear,
deltasNoOp,
deltasClearAfter);
}
fmt::print("Logical Size: {0}\n", totalLogicalBytes);
return key.substr(0, len);
}
};
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames) {
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames, bool newFormat) {
FileSet files;
CommonPrefixStats stats;
for (int i = 0; i < filenames.size(); i++) {
@ -2701,40 +2804,66 @@ FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filena
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
Arena arena;
GranuleSnapshot file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena);
Standalone<GranuleSnapshot> parsed(file, arena);
Standalone<GranuleSnapshot> parsed;
if (!newFormat) {
Arena arena;
GranuleSnapshot file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena);
parsed = Standalone<GranuleSnapshot>(file, arena);
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
for (auto& it : parsed) {
stats.addKeyValue(it.key, it.value);
}
} else {
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res = loadSnapshotFile(""_sr, data, normalKeys, {});
fmt::print("Loaded {0} rows from snapshot file\n", res.size());
for (auto& it : res) {
stats.addKeyValue(it.key, it.value);
}
}
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
files.snapshotFile = { filenames[i], version, data, parsed };
for (auto& it : parsed) {
stats.addKey(it.key);
}
} else {
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
Arena arena;
GranuleDeltas file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena);
Standalone<GranuleDeltas> parsed(file, arena);
if (!newFormat) {
Arena arena;
GranuleDeltas file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena);
Standalone<GranuleDeltas> parsed(file, arena);
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
}
}
}
} else {
bool startClear = false;
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res =
loadChunkedDeltaFile(""_sr, data, normalKeys, 0, version, {}, startClear);
ASSERT(!startClear);
Standalone<GranuleDeltas> parsed;
fmt::print("Loaded {0} boundaries from delta file\n", res.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : res) {
stats.addBoundary(it);
}
}
}
stats.doneFile();
}
files.commonPrefix = stats.done();
@ -2792,6 +2921,28 @@ std::pair<int64_t, double> doDeltaWriteBench(const Standalone<GranuleDeltas>& da
return { serializedBytes, elapsed };
}
void chunkFromFileSet(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef>& chunk,
StringRef* deltaPtrs,
Version readVersion,
Optional<BlobGranuleCipherKeysCtx> keys,
int numDeltaFiles) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < numDeltaFiles; i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
FileSet rewriteChunkedFileSet(const FileSet& fileSet,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
@ -2818,40 +2969,30 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
KeyRange readRange,
bool clearAllAtEnd,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
int numDeltaFiles,
bool printStats = false) {
Version readVersion = std::get<1>(fileSet.deltaFiles.back());
Standalone<BlobGranuleChunkRef> chunk;
StringRef deltaPtrs[fileSet.deltaFiles.size()];
GranuleMaterializeStats stats;
ASSERT(numDeltaFiles >= 0 && numDeltaFiles <= fileSet.deltaFiles.size());
StringRef deltaPtrs[numDeltaFiles];
MutationRef clearAllAtEndMutation;
if (clearAllAtEnd) {
clearAllAtEndMutation = MutationRef(MutationRef::Type::ClearRange, readRange.begin, readRange.end);
}
if (chunked) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < fileSet.deltaFiles.size(); i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunkFromFileSet(fileSet, chunk, deltaPtrs, readVersion, keys, numDeltaFiles);
if (clearAllAtEnd) {
readVersion++;
MutationsAndVersionRef lastDelta;
lastDelta.version = readVersion;
lastDelta.mutations.push_back(chunk.arena(), clearAllAtEndMutation);
chunk.includedVersion = readVersion;
chunk.newDeltas.push_back_deep(chunk.arena(), lastDelta);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
int64_t serializedBytes = 0;
@ -2875,14 +3016,26 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
}
serializedBytes += actualData.expectedSize();
} else {
RangeResult actualData =
materializeBlobGranule(chunk, readRange, 0, readVersion, std::get<2>(fileSet.snapshotFile), deltaPtrs);
RangeResult actualData = materializeBlobGranule(
chunk, readRange, 0, readVersion, std::get<2>(fileSet.snapshotFile), deltaPtrs, stats);
serializedBytes += actualData.expectedSize();
}
}
elapsed += timer_monotonic();
elapsed /= READ_RUNS;
serializedBytes /= READ_RUNS;
if (printStats) {
fmt::print("Materialize stats:\n");
fmt::print(" Input bytes: {0}\n", stats.inputBytes / READ_RUNS);
fmt::print(" Output bytes: {0}\n", stats.outputBytes / READ_RUNS);
fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes);
fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows / READ_RUNS);
fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared / READ_RUNS);
fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted / READ_RUNS);
fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated / READ_RUNS);
}
return { serializedBytes, elapsed };
}
@ -2913,7 +3066,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
int64_t logicalSnapshotSize = 0;
int64_t logicalDeltaSize = 0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it);
FileSet fileSet = loadFileSet(basePath, it, false);
fileSets.push_back(fileSet);
logicalSnapshotSize += std::get<3>(fileSet.snapshotFile).expectedSize();
for (auto& deltaFile : fileSet.deltaFiles) {
@ -2944,7 +3097,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3000,9 +3153,16 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
std::vector<std::string> readRunNames = {};
std::vector<std::pair<int64_t, double>> readMetrics;
bool doEdgeCaseReadTests = true;
bool doEdgeCaseReadTests = false;
bool doVaryingDeltaTests = false;
std::vector<double> clearAllReadMetrics;
std::vector<double> readSingleKeyMetrics;
std::vector<std::vector<std::pair<int64_t, double>>> varyingDeltaMetrics;
size_t maxDeltaFiles = 100000;
for (auto& f : fileSets) {
maxDeltaFiles = std::min(maxDeltaFiles, f.deltaFiles.size());
}
for (bool chunk : chunkModes) {
for (bool encrypt : encryptionModes) {
@ -3025,7 +3185,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3038,6 +3198,10 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
double totalElapsed = 0.0;
double totalElapsedClearAll = 0.0;
double totalElapsedSingleKey = 0.0;
std::vector<std::pair<int64_t, double>> varyingDeltas;
for (int i = 0; i <= maxDeltaFiles; i++) {
varyingDeltas.push_back({ 0, 0.0 });
}
for (auto& fileSet : fileSets) {
FileSet newFileSet;
if (!chunk) {
@ -3046,24 +3210,38 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
newFileSet = rewriteChunkedFileSet(fileSet, keys, compressionFilter);
}
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, compressionFilter);
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, newFileSet.deltaFiles.size());
totalBytesRead += res.first;
totalElapsed += res.second;
if (doEdgeCaseReadTests) {
totalElapsedClearAll +=
doReadBench(newFileSet, chunk, fileSet.range, true, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, fileSet.range, true, keys, newFileSet.deltaFiles.size())
.second;
Key k = std::get<3>(fileSet.snapshotFile).front().key;
KeyRange singleKeyRange(KeyRangeRef(k, keyAfter(k)));
totalElapsedSingleKey +=
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, newFileSet.deltaFiles.size())
.second;
}
if (doVaryingDeltaTests && chunk) {
for (int i = 0; i <= maxDeltaFiles; i++) {
auto r = doReadBench(newFileSet, chunk, fileSet.range, false, keys, i);
varyingDeltas[i].first += r.first;
varyingDeltas[i].second += r.second;
}
}
}
readMetrics.push_back({ totalBytesRead, totalElapsed });
if (doEdgeCaseReadTests) {
clearAllReadMetrics.push_back(totalElapsedClearAll);
readSingleKeyMetrics.push_back(totalElapsedSingleKey);
}
if (doVaryingDeltaTests) {
varyingDeltaMetrics.push_back(varyingDeltas);
}
}
}
}
@ -3097,6 +3275,25 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
}
}
if (doVaryingDeltaTests) {
ASSERT(readRunNames.size() == varyingDeltaMetrics.size());
fmt::print("\n\nVarying Deltas Read Results:\nDF#\t");
for (int i = 0; i <= maxDeltaFiles; i++) {
fmt::print("{0}\t", i);
}
fmt::print("\n");
for (int i = 0; i < readRunNames.size(); i++) {
fmt::print("{0}", readRunNames[i]);
for (auto& it : varyingDeltaMetrics[i]) {
double MBperCPUsec = (it.first / 1024.0 / 1024.0) / it.second;
fmt::print("\t{:.6}", MBperCPUsec);
}
fmt::print("\n");
}
}
fmt::print("\n\nCombined Results:\n");
ASSERT(readRunNames.size() == runNames.size() - 1);
for (int i = 0; i < readRunNames.size(); i++) {
@ -3113,3 +3310,22 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
return Void();
}
TEST_CASE("!/blobgranule/files/repeatFromFiles") {
std::string basePath = "SET_ME";
std::vector<std::vector<std::string>> fileSetNames = { { "SET_ME" } };
int64_t totalBytesRead = 0;
double totalElapsed = 0.0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it, true);
auto res = doReadBench(fileSet, true, fileSet.range, false, {}, fileSet.deltaFiles.size(), true);
totalBytesRead += res.first;
totalElapsed += res.second;
}
double MBperCPUsec = (totalBytesRead / 1024.0 / 1024.0) / totalElapsed;
fmt::print("Read Results: {:.6} MB/cpusec\n", MBperCPUsec);
return Void();
}

View File

@ -105,7 +105,9 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
arena.dependsOn(data.arena());
}
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData);
// TODO do something useful with stats?
GranuleMaterializeStats stats;
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData, stats);
} catch (Error& e) {
throw e;

View File

@ -1040,13 +1040,10 @@ private:
Key lastValue;
};
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Database> cx) {
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
// Read begin key, if this fails then block was invalid.
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
// Read kv pairs and end key
@ -1075,7 +1072,6 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1098,7 +1094,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
wait(decodeKVPairs(&reader, &results, false, cx));
decodeKVPairs(&reader, &results);
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1121,7 +1117,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
wait(decodeKVPairs(&reader, &results, true, cx));
decodeKVPairs(&reader, &results);
} else {
throw restore_unsupported_file_version();
}
@ -1704,7 +1700,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
state std::unique_ptr<IRangeFileWriter> rangeFile;
state BackupConfig backup(task);
state Arena arena;
state Reference<TenantEntryCache<Void>> tenantCache = makeReference<TenantEntryCache<Void>>(cx);
state Reference<TenantEntryCache<Void>> tenantCache;
// Don't need to check keepRunning(task) here because we will do that while finishing each output file, but
// if bc is false then clearly the backup is no longer in progress
@ -1798,6 +1794,10 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
// Initialize range file writer and write begin key
if (encryptionEnabled) {
CODE_PROBE(true, "using encrypted snapshot file writer");
if (!tenantCache.isValid()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache->init());
}
rangeFile = std::make_unique<EncryptedRangeFileWriter>(cx, &arena, tenantCache, outFile, blockSize);
} else {
rangeFile = std::make_unique<RangeFileWriter>(outFile, blockSize);

View File

@ -122,6 +122,7 @@ IdempotencyIdRef generate(Arena& arena) {
TEST_CASE("/fdbclient/IdempotencyId/basic") {
Arena arena;
uint16_t firstBatchIndex = deterministicRandom()->randomUInt32();
firstBatchIndex &= 0xff7f; // ensure firstBatchIndex+5 won't change the higher order byte
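// e.g. 0x12ff & 0xff7f == 0x127f; the masked low byte is at most 0x7f, so adding 5 cannot carry into the high byte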
uint16_t batchIndex = firstBatchIndex;
Version commitVersion = deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max());
std::vector<IdempotencyIdRef> idVector; // Reference

View File

@ -504,6 +504,73 @@ ThreadFuture<Void> DLTenant::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
return toThreadFuture<Void>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); });
}
ThreadFuture<bool> DLTenant::blobbifyRange(const KeyRangeRef& keyRange) {
if (!api->tenantBlobbifyRange) {
return unsupported_operation();
}
FdbCApi::FDBFuture* f = api->tenantBlobbifyRange(
tenant, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size());
return toThreadFuture<bool>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
FdbCApi::fdb_bool_t ret = false;
ASSERT(!api->futureGetBool(f, &ret));
return ret;
});
}
ThreadFuture<bool> DLTenant::unblobbifyRange(const KeyRangeRef& keyRange) {
if (!api->tenantUnblobbifyRange) {
return unsupported_operation();
}
FdbCApi::FDBFuture* f = api->tenantUnblobbifyRange(
tenant, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size());
return toThreadFuture<bool>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
FdbCApi::fdb_bool_t ret = false;
ASSERT(!api->futureGetBool(f, &ret));
return ret;
});
}
ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> DLTenant::listBlobbifiedRanges(const KeyRangeRef& keyRange,
int rangeLimit) {
if (!api->tenantListBlobbifiedRanges) {
return unsupported_operation();
}
FdbCApi::FDBFuture* f = api->tenantListBlobbifiedRanges(
tenant, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit);
return toThreadFuture<Standalone<VectorRef<KeyRangeRef>>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
const FdbCApi::FDBKeyRange* keyRanges;
int keyRangesLength;
FdbCApi::fdb_error_t error = api->futureGetKeyRangeArray(f, &keyRanges, &keyRangesLength);
ASSERT(!error);
// The memory for this is stored in the FDBFuture and is released when the future gets destroyed.
return Standalone<VectorRef<KeyRangeRef>>(VectorRef<KeyRangeRef>((KeyRangeRef*)keyRanges, keyRangesLength),
Arena());
});
}
ThreadFuture<Version> DLTenant::verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) {
if (!api->tenantVerifyBlobRange) {
return unsupported_operation();
}
Version readVersion = version.present() ? version.get() : latestVersion;
FdbCApi::FDBFuture* f = api->tenantVerifyBlobRange(
tenant, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), readVersion);
return toThreadFuture<Version>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
Version version = invalidVersion;
ASSERT(!api->futureGetInt64(f, &version));
return version;
});
}
// DLDatabase
DLDatabase::DLDatabase(Reference<FdbCApi> api, ThreadFuture<FdbCApi::FDBDatabase*> dbFuture) : api(api), db(nullptr) {
addref();
@ -841,12 +908,32 @@ void DLApi::init() {
lib,
fdbCPath,
"fdb_tenant_purge_blob_granules",
headerVersion >= ApiVersion::withBlobRangeApi().version());
headerVersion >= ApiVersion::withTenantBlobRangeApi().version());
loadClientFunction(&api->tenantWaitPurgeGranulesComplete,
lib,
fdbCPath,
"fdb_tenant_wait_purge_granules_complete",
headerVersion >= ApiVersion::withBlobRangeApi().version());
headerVersion >= ApiVersion::withTenantBlobRangeApi().version());
loadClientFunction(&api->tenantBlobbifyRange,
lib,
fdbCPath,
"fdb_tenant_blobbify_range",
headerVersion >= ApiVersion::withTenantBlobRangeApi().version());
loadClientFunction(&api->tenantUnblobbifyRange,
lib,
fdbCPath,
"fdb_tenant_unblobbify_range",
headerVersion >= ApiVersion::withTenantBlobRangeApi().version());
loadClientFunction(&api->tenantListBlobbifiedRanges,
lib,
fdbCPath,
"fdb_tenant_list_blobbified_ranges",
headerVersion >= ApiVersion::withTenantBlobRangeApi().version());
loadClientFunction(&api->tenantVerifyBlobRange,
lib,
fdbCPath,
"fdb_tenant_verify_blob_range",
headerVersion >= ApiVersion::withTenantBlobRangeApi().version());
loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710);
loadClientFunction(&api->transactionSetOption, lib, fdbCPath, "fdb_transaction_set_option", headerVersion >= 0);
@ -1630,13 +1717,41 @@ Reference<ITransaction> MultiVersionTenant::createTransaction() {
}
ThreadFuture<Key> MultiVersionTenant::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {
auto f = tenantState->db ? tenantState->db->purgeBlobGranules(keyRange, purgeVersion, force)
: ThreadFuture<Key>(Never());
return abortableFuture(f, tenantState->db->dbState->dbVar->get().onChange);
auto tenantDb = tenantState->tenantVar->get();
auto f =
tenantDb.value ? tenantDb.value->purgeBlobGranules(keyRange, purgeVersion, force) : ThreadFuture<Key>(Never());
return abortableFuture(f, tenantDb.onChange);
}
ThreadFuture<Void> MultiVersionTenant::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
auto f = tenantState->db ? tenantState->db->waitPurgeGranulesComplete(purgeKey) : ThreadFuture<Void>(Never());
return abortableFuture(f, tenantState->db->dbState->dbVar->get().onChange);
auto tenantDb = tenantState->tenantVar->get();
auto f = tenantDb.value ? tenantDb.value->waitPurgeGranulesComplete(purgeKey) : ThreadFuture<Void>(Never());
return abortableFuture(f, tenantDb.onChange);
}
ThreadFuture<bool> MultiVersionTenant::blobbifyRange(const KeyRangeRef& keyRange) {
auto tenantDb = tenantState->tenantVar->get();
auto f = tenantDb.value ? tenantDb.value->blobbifyRange(keyRange) : ThreadFuture<bool>(Never());
return abortableFuture(f, tenantDb.onChange);
}
ThreadFuture<bool> MultiVersionTenant::unblobbifyRange(const KeyRangeRef& keyRange) {
auto tenantDb = tenantState->tenantVar->get();
auto f = tenantDb.value ? tenantDb.value->unblobbifyRange(keyRange) : ThreadFuture<bool>(Never());
return abortableFuture(f, tenantDb.onChange);
}
ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> MultiVersionTenant::listBlobbifiedRanges(const KeyRangeRef& keyRange,
int rangeLimit) {
auto tenantDb = tenantState->tenantVar->get();
auto f = tenantDb.value ? tenantDb.value->listBlobbifiedRanges(keyRange, rangeLimit)
: ThreadFuture<Standalone<VectorRef<KeyRangeRef>>>(Never());
return abortableFuture(f, tenantDb.onChange);
}
ThreadFuture<Version> MultiVersionTenant::verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) {
auto tenantDb = tenantState->tenantVar->get();
auto f = tenantDb.value ? tenantDb.value->verifyBlobRange(keyRange, version) : ThreadFuture<Version>(Never());
return abortableFuture(f, tenantDb.onChange);
}
MultiVersionTenant::TenantState::TenantState(Reference<MultiVersionDatabase> db, TenantNameRef tenantName)

View File

@ -40,6 +40,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbrpc/TenantInfo.h"
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/AnnotateActor.h"
@ -66,6 +67,7 @@
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantSpecialKeys.actor.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbclient/versions.h"
@ -687,25 +689,8 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
.detail("MedianBytesPerCommit", cx->bytesPerCommit.median())
.detail("MaxBytesPerCommit", cx->bytesPerCommit.max())
.detail("NumLocalityCacheEntries", cx->locationCache.size());
if (cx->anyBlobGranuleRequests) {
ev.detail("MeanBGLatency", cx->bgLatencies.mean())
.detail("MedianBGLatency", cx->bgLatencies.median())
.detail("MaxBGLatency", cx->bgLatencies.max())
.detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
.detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
.detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
}
}
cx->latencies.clear();
cx->readLatencies.clear();
cx->GRVLatencies.clear();
cx->commitLatencies.clear();
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
cx->bgLatencies.clear();
cx->bgGranulesPerRequest.clear();
if (cx->usedAnyChangeFeeds && logTraces) {
TraceEvent feedEv("ChangeFeedClientMetrics", cx->dbId);
@ -719,6 +704,37 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
cx->ccFeed.logToTraceEvent(feedEv);
}
if (cx->anyBGReads && logTraces) {
TraceEvent bgReadEv("BlobGranuleReadMetrics", cx->dbId);
bgReadEv.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
.detail("Cluster",
cx->getConnectionRecord()
? cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString()
: "")
.detail("Internal", cx->internal);
// add counters
cx->ccBG.logToTraceEvent(bgReadEv);
// add latencies
bgReadEv.detail("MeanBGLatency", cx->bgLatencies.mean())
.detail("MedianBGLatency", cx->bgLatencies.median())
.detail("MaxBGLatency", cx->bgLatencies.max())
.detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
.detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
.detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
}
cx->latencies.clear();
cx->readLatencies.clear();
cx->GRVLatencies.clear();
cx->commitLatencies.clear();
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
cx->bgLatencies.clear();
cx->bgGranulesPerRequest.clear();
lastLogged = now();
}
}
@ -1524,17 +1540,21 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc),
bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"),
feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed),
feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed),
feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000),
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0), cachedReadVersion(0),
lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0),
transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor),
coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0),
detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), anyBGReads(false),
ccBG("BlobGranuleReadMetrics"), bgReadInputBytes("BGReadInputBytes", ccBG),
bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG),
bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG),
bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(1000), bgGranulesPerRequest(1000),
usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), feedStreamStarts("FeedStreamStarts", ccFeed),
feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed),
feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), feedPops("FeedPops", ccFeed),
feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), outstandingWatches(0), sharedStatePtr(nullptr),
lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0),
lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo),
clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0),
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
specialKeySpace(std::make_unique<SpecialKeySpace>(specialKeys.begin, specialKeys.end, /* test */ false)),
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {
@ -1824,14 +1844,17 @@ DatabaseContext::DatabaseContext(const Error& err)
transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc),
bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"),
feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed),
feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed),
feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000),
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
bgGranulesPerRequest(1000), sharedStatePtr(nullptr), transactionTracingSample(false),
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), anyBGReads(false),
ccBG("BlobGranuleReadMetrics"), bgReadInputBytes("BGReadInputBytes", ccBG),
bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG),
bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG),
bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(1000), bgGranulesPerRequest(1000),
usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), feedStreamStarts("FeedStreamStarts", ccFeed),
feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed),
feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), feedPops("FeedPops", ccFeed),
feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), sharedStatePtr(nullptr),
transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {}
// Static constructor used by server processes to create a DatabaseContext
@ -6232,7 +6255,7 @@ ACTOR Future<Optional<ClientTrCommitCostEstimation>> estimateCommitCosts(Referen
trCommitCosts.opsCount++;
keyRange = KeyRangeRef(mutation.param1, mutation.param2);
if (trState->options.expensiveClearCostEstimation) {
StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY));
StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY, trState));
trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes));
trCommitCosts.writeCosts += getWriteOperationCost(m.bytes);
++trCommitCosts.expensiveCostEstCount;
@ -7505,34 +7528,45 @@ Future<Void> Transaction::onError(Error const& e) {
return e;
}
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRange keys);
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
KeyRange keys,
Optional<Reference<TransactionState>> trState);
ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx, KeyRange keys, Reference<LocationInfo> locationInfo) {
loop {
try {
WaitMetricsRequest req(keys, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
StorageMetrics m = wait(loadBalance(
locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(Key(), keys);
StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys));
return m;
ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
KeyRange keys,
Reference<LocationInfo> locationInfo,
TenantMapEntry tenantEntry,
Optional<Reference<TransactionState>> trState) {
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
try {
WaitMetricsRequest req(tenantInfo, keys, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
StorageMetrics m = wait(loadBalance(
locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(tenantEntry.prefix, keys);
StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys, trState));
return m;
}
}
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRange keys) {
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
KeyRange keys,
Optional<Reference<TransactionState>> trState) {
state Span span("NAPI:GetStorageMetricsLargeKeyRange"_loc);
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
std::vector<KeyRangeLocationInfo> locations = wait(getKeyRangeLocations(cx,
TenantInfo(),
tenantInfo,
keys,
std::numeric_limits<int>::max(),
Reverse::False,
@ -7548,7 +7582,8 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRang
for (int i = 0; i < nLocs; i++) {
partBegin = (i == 0) ? keys.begin : locations[i].range.begin;
partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end;
fx[i] = doGetStorageMetrics(cx, KeyRangeRef(partBegin, partEnd), locations[i].locations);
fx[i] = doGetStorageMetrics(
cx, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
}
wait(waitForAll(fx));
for (int i = 0; i < nLocs; i++) {
@ -7557,14 +7592,15 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRang
return total;
}
ACTOR Future<Void> trackBoundedStorageMetrics(KeyRange keys,
ACTOR Future<Void> trackBoundedStorageMetrics(TenantInfo tenantInfo,
KeyRange keys,
Reference<LocationInfo> location,
StorageMetrics x,
StorageMetrics halfError,
PromiseStream<StorageMetrics> deltaStream) {
try {
loop {
WaitMetricsRequest req(keys, x - halfError, x + halfError);
WaitMetricsRequest req(tenantInfo, keys, x - halfError, x + halfError);
StorageMetrics nextX = wait(loadBalance(location->locations(), &StorageServerInterface::waitMetrics, req));
deltaStream.send(nextX - x);
x = nextX;
@ -7575,7 +7611,8 @@ ACTOR Future<Void> trackBoundedStorageMetrics(KeyRange keys,
}
}
ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<KeyRangeLocationInfo> locations,
ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(TenantInfo tenantInfo,
std::vector<KeyRangeLocationInfo> locations,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError) {
@ -7589,7 +7626,7 @@ ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<Key
state StorageMetrics minMinus = min - halfErrorPerMachine * (nLocs - 1);
for (int i = 0; i < nLocs; i++) {
WaitMetricsRequest req(locations[i].range, StorageMetrics(), StorageMetrics());
WaitMetricsRequest req(tenantInfo, locations[i].range, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
fx[i] = loadBalance(locations[i].locations->locations(),
@ -7610,7 +7647,7 @@ ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<Key
for (int i = 0; i < nLocs; i++)
wx[i] = trackBoundedStorageMetrics(
locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas);
tenantInfo, locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas);
loop {
StorageMetrics delta = waitNext(deltas.getFuture());
@ -7695,25 +7732,30 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
}
}
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Database cx,
KeyRange keys,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError,
int shardLimit,
int expectedShardCount) {
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
Database cx,
KeyRange keys,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError,
int shardLimit,
int expectedShardCount,
Optional<Reference<TransactionState>> trState) {
state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample));
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
loop {
std::vector<KeyRangeLocationInfo> locations = wait(getKeyRangeLocations(cx,
TenantInfo(),
keys,
shardLimit,
Reverse::False,
&StorageServerInterface::waitMetrics,
span.context,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
state std::vector<KeyRangeLocationInfo> locations =
wait(getKeyRangeLocations(cx,
tenantInfo,
keys,
shardLimit,
Reverse::False,
&StorageServerInterface::waitMetrics,
span.context,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
if (expectedShardCount >= 0 && locations.size() != expectedShardCount) {
return std::make_pair(Optional<StorageMetrics>(), locations.size());
}
@ -7724,9 +7766,9 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(locations, min, max, permittedError);
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(keys, min, max);
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
@ -7739,7 +7781,7 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(Key(), keys);
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
} else {
@ -7749,7 +7791,7 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
.detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY);
wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
// make sure that the next getKeyRangeLocations() call will actually re-fetch the range
cx->invalidateCache(Key(), keys);
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
}
}
}
@ -7760,17 +7802,21 @@ Future<std::pair<Optional<StorageMetrics>, int>> DatabaseContext::waitStorageMet
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount) {
int expectedShardCount,
Optional<Reference<TransactionState>> trState) {
return ::waitStorageMetrics(Database(Reference<DatabaseContext>::addRef(this)),
keys,
min,
max,
permittedError,
shardLimit,
expectedShardCount);
expectedShardCount,
trState);
}
Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys, int shardLimit) {
Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys,
int shardLimit,
Optional<Reference<TransactionState>> trState) {
if (shardLimit > 0) {
StorageMetrics m;
m.bytes = -1;
@ -7780,9 +7826,10 @@ Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys,
m,
StorageMetrics(),
shardLimit,
-1));
-1,
trState));
} else {
return ::getStorageMetricsLargeKeyRange(Database(Reference<DatabaseContext>::addRef(this)), keys);
return ::getStorageMetricsLargeKeyRange(Database(Reference<DatabaseContext>::addRef(this)), keys, trState);
}
}
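// Illustrative sketch (not part of this change; names hypothetical): a tenant-aware caller.
// Assumes `trState` comes from a transaction with hasTenant() == true; a negative shardLimit
// routes through getStorageMetricsLargeKeyRange() above.
ACTOR Future<int64_t> exampleTenantRangeBytes(Database cx, KeyRange keys, Reference<TransactionState> trState) {
	StorageMetrics m = wait(cx->getStorageMetrics(keys, -1, trState));
	return m.bytes;
}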
@ -7873,25 +7920,46 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(Reference<Transa
}
// kind of a hack, but necessary to work around needing to access system keys in a tenant-enabled transaction
ACTOR Future<TenantMapEntry> blobGranuleGetTenantEntry(Transaction* self, Key rangeStartKey) {
ACTOR Future<TenantMapEntry> blobGranuleGetTenantEntry(Transaction* self,
Key rangeStartKey,
Optional<TenantName> tenantName) {
ASSERT(tenantName.present() || self->getTenant().present());
TenantName tName = tenantName.present() ? tenantName.get() : self->getTenant().get();
state TenantMapEntry tme;
Optional<KeyRangeLocationInfo> cachedLocationInfo =
self->trState->cx->getCachedLocation(self->getTenant().get(), rangeStartKey, Reverse::False);
self->trState->cx->getCachedLocation(tName, rangeStartKey, Reverse::False);
if (!cachedLocationInfo.present()) {
// If we're passing in a tenant, use that and do not touch the transaction.
TenantInfo tInfo;
if (tenantName.present()) {
tInfo = TenantInfo(tName, {}, TenantInfo::INVALID_TENANT);
} else {
tInfo = self->trState->getTenantInfo(AllowInvalidTenantID::True);
}
KeyRangeLocationInfo l = wait(getKeyLocation_internal(
self->trState->cx,
self->trState->getTenantInfo(AllowInvalidTenantID::True),
tInfo,
rangeStartKey,
self->trState->spanContext,
self->trState->readOptions.present() ? self->trState->readOptions.get().debugID : Optional<UID>(),
self->trState->useProvisionalProxies,
Reverse::False,
latestVersion));
self->trState->trySetTenantId(l.tenantEntry.id);
return l.tenantEntry;
tme = l.tenantEntry;
} else {
self->trState->trySetTenantId(cachedLocationInfo.get().tenantEntry.id);
return cachedLocationInfo.get().tenantEntry;
tme = cachedLocationInfo.get().tenantEntry;
}
if (tme.id == TenantInfo::INVALID_TENANT) {
throw tenant_not_found();
}
// Modify transaction if desired.
if (!tenantName.present()) {
self->trState->trySetTenantId(tme.id);
}
return tme;
}
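// Illustrative sketch of the two call patterns this helper now supports (variable names
// hypothetical): pass a tenant name to resolve it without touching the transaction, or pass {}
// to use the transaction's own tenant and record the resolved id on trState.
//
// TenantMapEntry viaName = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenantName));
// TenantMapEntry viaTxn  = wait(blobGranuleGetTenantEntry(&tr, range.begin, {}));
// KeyRange prefixed = range.withPrefix(viaName.prefix);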
Future<Standalone<VectorRef<KeyRef>>> Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) {
@ -7916,7 +7984,7 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRangesActor(Trans
if (self->getTenant().present()) {
// have to bypass tenant to read system key space, and add tenant prefix to part of mapping
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(self, currentRange.begin));
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(self, currentRange.begin, {}));
tenantPrefix = tenantEntry.prefix;
} else {
self->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -8019,7 +8087,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
if (self->getTenant().present()) {
// have to bypass tenant to read system key space, and add tenant prefix to part of mapping
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(self, range.begin));
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(self, range.begin, {}));
tenantPrefix = tenantEntry.prefix;
Standalone<StringRef> mappingPrefix = tenantEntry.prefix.withPrefix(blobGranuleMappingKeys.begin);
@ -8049,8 +8117,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
}
if (blobGranuleMapping.more) {
if (BG_REQUEST_DEBUG) {
fmt::print(
"BG Mapping for [{0} - %{1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable());
fmt::print("BG Mapping for [{0} - {1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable());
}
TraceEvent(SevWarn, "BGMappingTooLarge")
.detail("Range", range)
@ -8263,7 +8330,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
}
}
self->trState->cx->anyBlobGranuleRequests = true;
self->trState->cx->anyBGReads = true;
self->trState->cx->bgGranulesPerRequest.addSample(results.size());
self->trState->cx->bgLatencies.addSample(now() - startTime);
@ -8305,8 +8372,13 @@ Transaction::summarizeBlobGranules(const KeyRange& range, Optional<Version> summ
}
void Transaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) {
trState->cx->anyBGReads = true;
trState->cx->bgReadInputBytes += stats.inputBytes;
trState->cx->bgReadOutputBytes += stats.outputBytes;
trState->cx->bgReadSnapshotRows += stats.snapshotRows;
trState->cx->bgReadRowsCleared += stats.rowsCleared;
trState->cx->bgReadRowsInserted += stats.rowsInserted;
trState->cx->bgReadRowsUpdated += stats.rowsUpdated;
}
ACTOR Future<Version> setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) {
@ -8350,7 +8422,10 @@ ACTOR Future<Version> checkBlobSubrange(Database db, KeyRange keyRange, Optional
}
}
ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx, KeyRange range, Optional<Version> version) {
ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx,
KeyRange range,
Optional<Version> version,
Optional<TenantName> tenantName) {
state Database db(cx);
state Transaction tr(db);
state Standalone<VectorRef<KeyRangeRef>> allRanges;
@ -8358,6 +8433,7 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx, KeyRan
state Version readVersionOut = invalidVersion;
state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2;
state int loadSize = (BUGGIFY ? deterministicRandom()->randomInt(1, 20) : 20) * batchSize;
state bool loadedTenantEntry = false;
if (version.present()) {
if (version.get() == latestVersion) {
@ -8381,6 +8457,12 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx, KeyRan
if (curRegion.begin >= range.end) {
return readVersionOut;
}
if (tenantName.present() && !loadedTenantEntry) {
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenantName));
loadedTenantEntry = true;
range = range.withPrefix(tenantEntry.prefix);
curRegion = KeyRangeRef(range.begin, range.begin);
}
loop {
try {
wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize)));
@ -8432,8 +8514,10 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx, KeyRan
}
}
Future<Version> DatabaseContext::verifyBlobRange(const KeyRange& range, Optional<Version> version) {
return verifyBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, version);
Future<Version> DatabaseContext::verifyBlobRange(const KeyRange& range,
Optional<Version> version,
Optional<TenantName> tenantName) {
return verifyBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, version, tenantName);
}
ACTOR Future<std::vector<std::pair<UID, StorageWiggleValue>>> readStorageWiggleValues(Database cx,
@ -10500,7 +10584,7 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
if (tenant.present() && !loadedTenantPrefix) {
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin));
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenant));
loadedTenantPrefix = true;
purgeRange = purgeRange.withPrefix(tenantEntry.prefix);
}
@ -10619,9 +10703,13 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Reference<ReadYou
}
}
ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx, KeyRange range, bool active) {
ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
KeyRange range,
bool active,
Optional<TenantName> tenantName) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state bool loadedTenantEntry = false;
state Value value = active ? blobRangeActive : blobRangeInactive;
loop {
@ -10629,6 +10717,13 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx, KeyRange ran
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
if (tenantName.present() && !loadedTenantEntry) {
TenantMapEntry tenantEntry =
wait(blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName));
loadedTenantEntry = true;
range = range.withPrefix(tenantEntry.prefix);
}
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(tr, range, 1));
if (active) {
@ -10665,27 +10760,59 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx, KeyRange ran
}
}
Future<bool> DatabaseContext::blobbifyRange(KeyRange range) {
return setBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, true);
Future<bool> DatabaseContext::blobbifyRange(KeyRange range, Optional<TenantName> tenantName) {
return setBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, true, tenantName);
}
Future<bool> DatabaseContext::unblobbifyRange(KeyRange range) {
return setBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, false);
Future<bool> DatabaseContext::unblobbifyRange(KeyRange range, Optional<TenantName> tenantName) {
return setBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, false, tenantName);
}
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Reference<DatabaseContext> cx,
KeyRange range,
int rangeLimit) {
int rangeLimit,
Optional<TenantName> tenantName) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state TenantMapEntry tme;
loop {
try {
if (tenantName.present()) {
wait(store(tme, blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
if (!tenantName.present()) {
return blobRanges;
}
return blobRanges;
// Strip tenant prefix out.
state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
for (auto& blobRange : blobRanges) {
// Filter out blob ranges that unexpectedly span tenant boundaries.
if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
TraceEvent("ListBlobbifiedRangeSpansTenants")
.suppressFor(/*seconds=*/5)
.detail("Tenant", tenantName.get())
.detail("Range", blobRange);
continue;
}
tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
}
return tenantBlobRanges;
}
Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range, int rowLimit) {
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rowLimit);
Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,
int rowLimit,
Optional<TenantName> tenantName) {
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rowLimit, tenantName);
}
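// Illustrative sketch (function name and flow hypothetical) of the tenant-aware blob
// management these wrappers enable: blobbify a range within a tenant, list the resulting
// ranges (returned with the tenant prefix stripped), then verify granule coverage.
ACTOR Future<Void> exampleManageTenantBlobRange(Reference<DatabaseContext> cx, KeyRange range, TenantName tenant) {
	bool blobbified = wait(cx->blobbifyRange(range, tenant));
	ASSERT(blobbified);
	Standalone<VectorRef<KeyRangeRef>> ranges = wait(cx->listBlobbifiedRanges(range, 1000, tenant));
	ASSERT(!ranges.empty());
	Version readVersion = wait(cx->verifyBlobRange(range, latestVersion, tenant));
	ASSERT(readVersion != invalidVersion);
	return Void();
}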
int64_t getMaxKeySize(KeyRef const& key) {

@ -1770,7 +1770,10 @@ Future<int64_t> ReadYourWritesTransaction::getEstimatedRangeSizeBytes(const KeyR
if (resetPromise.isSet())
return resetPromise.getFuture().getError();
return map(waitOrError(tr.getDatabase()->getStorageMetrics(keys, -1), resetPromise.getFuture()),
// Pass in the TransactionState only if tenant is present
Optional<Reference<TransactionState>> trState =
tr.trState->hasTenant() ? tr.trState : Optional<Reference<TransactionState>>();
return map(waitOrError(tr.getDatabase()->getStorageMetrics(keys, -1, trState), resetPromise.getFuture()),
[](const StorageMetrics& m) { return m.bytes; });
}

@ -582,7 +582,8 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema(
"duplicate_mutation_fetch_timeout",
"primary_dc_missing",
"fetch_primary_dc_timeout",
"fetch_storage_wiggler_stats_timeout"
"fetch_storage_wiggler_stats_timeout",
"fetch_consistency_scan_info_timeout"
]
},
"issues":[

@ -39,11 +39,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENABLE_VERSION_VECTOR, false );
init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false );
bool buggifyShortReadWindow = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR;
init( MAX_READ_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND; else if (buggifyShortReadWindow) MAX_READ_TRANSACTION_LIFE_VERSIONS = std::max<int>(1, 0.1 * VERSIONS_PER_SECOND); else if( randomize && BUGGIFY ) MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND;
init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_WRITE_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_WRITE_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
init( MAX_VERSION_RATE_MODIFIER, 0.1 );
init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps.
init( ENABLE_VERSION_VECTOR_HA_OPTIMIZATION, false );
@ -296,7 +297,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false );
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -420,6 +421,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experimental purposes; never enable it in production.
// If enabled, all the committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable the ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These two knobs have conflicting functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
// Can-commit checks will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds, up to
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if RocksDB is overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0 to disable this behavior.
@ -725,6 +730,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false );
init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 );
init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 );
init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 );
init( GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED, 10 );
init( GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER, 240.0 );
//Storage Metrics
init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 );
@ -784,7 +792,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );

@ -145,13 +145,13 @@ Value ThrottleApi::TagQuotaValue::toValue() const {
ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) {
auto tuple = Tuple::unpack(value);
if (tuple.size() != 4) {
if (tuple.size() != 2) {
throw invalid_throttle_quota_value();
}
TagQuotaValue result;
try {
result.reservedQuota = tuple.getDouble(0);
result.totalQuota = tuple.getDouble(1);
result.reservedQuota = tuple.getInt(0);
result.totalQuota = tuple.getInt(1);
} catch (Error& e) {
TraceEvent(SevWarnAlways, "TagQuotaValueFailedToDeserialize").error(e);
throw invalid_throttle_quota_value();

@ -246,6 +246,47 @@ ThreadFuture<Void> ThreadSafeTenant::waitPurgeGranulesComplete(const KeyRef& pur
});
}
ThreadFuture<bool> ThreadSafeTenant::blobbifyRange(const KeyRangeRef& keyRange) {
DatabaseContext* db = this->db->db;
TenantName tenantName = this->name;
KeyRange range = keyRange;
return onMainThread([=]() -> Future<bool> {
db->checkDeferredError();
return db->blobbifyRange(range, tenantName);
});
}
ThreadFuture<bool> ThreadSafeTenant::unblobbifyRange(const KeyRangeRef& keyRange) {
DatabaseContext* db = this->db->db;
TenantName tenantName = this->name;
KeyRange range = keyRange;
return onMainThread([=]() -> Future<bool> {
db->checkDeferredError();
return db->unblobbifyRange(range, tenantName);
});
}
ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> ThreadSafeTenant::listBlobbifiedRanges(const KeyRangeRef& keyRange,
int rangeLimit) {
DatabaseContext* db = this->db->db;
TenantName tenantName = this->name;
KeyRange range = keyRange;
return onMainThread([=]() -> Future<Standalone<VectorRef<KeyRangeRef>>> {
db->checkDeferredError();
return db->listBlobbifiedRanges(range, rangeLimit, tenantName);
});
}
ThreadFuture<Version> ThreadSafeTenant::verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) {
DatabaseContext* db = this->db->db;
TenantName tenantName = this->name;
KeyRange range = keyRange;
return onMainThread([=]() -> Future<Version> {
db->checkDeferredError();
return db->verifyBlobRange(range, version, tenantName);
});
}
ThreadSafeTenant::~ThreadSafeTenant() {}
ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx,

@ -56,10 +56,18 @@ struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
};
struct GranuleMaterializeStats {
// file-level stats
int64_t inputBytes;
int64_t outputBytes;
GranuleMaterializeStats() : inputBytes(0), outputBytes(0) {}
// merge stats
int32_t snapshotRows;
int32_t rowsCleared;
int32_t rowsInserted;
int32_t rowsUpdated;
GranuleMaterializeStats()
: inputBytes(0), outputBytes(0), snapshotRows(0), rowsCleared(0), rowsInserted(0), rowsUpdated(0) {}
};
struct BlobGranuleCipherKeysMeta {

@ -51,7 +51,8 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]);
StringRef deltaFileData[],
GranuleMaterializeStats& stats);
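// Illustrative usage sketch (caller variable names hypothetical): the caller now owns a stats
// object and forwards it to the client-side counters after materializing:
//
// GranuleMaterializeStats stats;
// RangeResult rows =
//     materializeBlobGranule(chunk, beginVersion, readVersion, snapshotData, deltaFiles, stats);
// tr.addGranuleMaterializeStats(stats); // feeds the bgRead* counters on DatabaseContext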
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);

@ -62,8 +62,8 @@ struct BlobMetadataDetailsRef {
BlobMetadataDomainNameRef domainName,
Optional<StringRef> base,
VectorRef<StringRef> partitions,
int64_t refreshAt,
int64_t expireAt)
double refreshAt,
double expireAt)
: domainId(domainId), domainName(ar, domainName), partitions(ar, partitions), refreshAt(refreshAt),
expireAt(expireAt) {
if (base.present()) {

@ -298,13 +298,19 @@ public:
Future<Void> onProxiesChanged() const;
Future<HealthMetrics> getHealthMetrics(bool detailed);
// Pass a negative value for `shardLimit` to indicate no limit on the shard number.
Future<StorageMetrics> getStorageMetrics(KeyRange const& keys, int shardLimit);
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount);
// Pass a valid `trState` with `hasTenant() == true` to make the function tenant-aware.
Future<StorageMetrics> getStorageMetrics(
KeyRange const& keys,
int shardLimit,
Optional<Reference<TransactionState>> trState = Optional<Reference<TransactionState>>());
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount,
Optional<Reference<TransactionState>> trState = Optional<Reference<TransactionState>>());
Future<Void> splitStorageMetricsStream(PromiseStream<Key> const& resultsStream,
KeyRange const& keys,
StorageMetrics const& limit,
@ -388,10 +394,14 @@ public:
bool force = false);
Future<Void> waitPurgeGranulesComplete(Key purgeKey);
Future<bool> blobbifyRange(KeyRange range);
Future<bool> unblobbifyRange(KeyRange range);
Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(KeyRange range, int rangeLimit);
Future<Version> verifyBlobRange(const KeyRange& range, Optional<Version> version);
Future<bool> blobbifyRange(KeyRange range, Optional<TenantName> tenantName = {});
Future<bool> unblobbifyRange(KeyRange range, Optional<TenantName> tenantName = {});
Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(KeyRange range,
int rangeLimit,
Optional<TenantName> tenantName = {});
Future<Version> verifyBlobRange(const KeyRange& range,
Optional<Version> version,
Optional<TenantName> tenantName = {});
// private:
explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,
@ -544,8 +554,17 @@ public:
Counter transactionGrvFullBatches;
Counter transactionGrvTimedOutBatches;
Counter transactionCommitVersionNotFoundForSS;
// Blob Granule Read metrics. Omit from logging if not used.
bool anyBGReads;
CounterCollection ccBG;
Counter bgReadInputBytes;
Counter bgReadOutputBytes;
Counter bgReadSnapshotRows;
Counter bgReadRowsCleared;
Counter bgReadRowsInserted;
Counter bgReadRowsUpdated;
ContinuousSample<double> bgLatencies, bgGranulesPerRequest;
// Change Feed metrics. Omit change feed metrics from logging if not used
bool usedAnyChangeFeeds;
@ -558,7 +577,7 @@ public:
Counter feedPopsFallback;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit,
bytesPerCommit, bgLatencies, bgGranulesPerRequest;
bytesPerCommit;
int outstandingWatches;
int maxOutstandingWatches;
@ -587,7 +606,6 @@ public:
bool transactionTracingSample;
double verifyCausalReadsProp = 0.0;
bool blobGranuleNoMaterialize = false;
bool anyBlobGranuleRequests = false;
Future<Void> logger;
Future<Void> throttleExpirer;

@ -1402,6 +1402,25 @@ struct TenantMode {
serializer(ar, mode);
}
// This does not round-trip cleanly with toString(): the '_experimental' suffix, if present,
// must be stripped before the mode can be parsed.
static TenantMode fromString(std::string mode) {
if (mode.find("_experimental") != std::string::npos) {
mode.replace(mode.find("_experimental"), std::string::npos, "");
}
if (mode == "disabled") {
return TenantMode::DISABLED;
} else if (mode == "optional") {
return TenantMode::OPTIONAL_TENANT;
} else if (mode == "required") {
return TenantMode::REQUIRED;
} else {
TraceEvent(SevError, "UnknownTenantMode").detail("TenantMode", mode);
ASSERT(false);
throw internal_error();
}
}
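// Illustrative round trip, given the caveat above (a sketch, not part of this change):
//
// TenantMode m = TenantMode::OPTIONAL_TENANT;
// std::string s = m.toString();                  // may carry an "_experimental" suffix
// TenantMode parsed = TenantMode::fromString(s); // strips the suffix before matching
// ASSERT(parsed == TenantMode::OPTIONAL_TENANT);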
std::string toString() const {
switch (mode) {
case DISABLED:
@ -1686,10 +1705,20 @@ struct Versionstamp {
serializer(ar, beVersion, beBatch);
if constexpr (Ar::isDeserializing) {
version = bigEndian64(version);
version = bigEndian64(beVersion);
batchNumber = bigEndian16(beBatch);
}
}
};
template <class Ar>
inline void save(Ar& ar, const Versionstamp& value) {
return const_cast<Versionstamp&>(value).serialize(ar);
}
template <class Ar>
inline void load(Ar& ar, Versionstamp& value) {
value.serialize(ar);
}
#endif
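// Illustrative sketch: with the save/load overloads above, a Versionstamp can be stored in and
// read back from a KeyBackedBinaryValue (as lastTenantModification is, later in this change).
// The key name here is hypothetical:
//
// KeyBackedBinaryValue<Versionstamp> lastMod("\xff/example/lastMod"_sr);
// lastMod.setVersionstamp(tr, Versionstamp(), 0); // versionstamp filled in at commit
// Optional<Versionstamp> v = wait(lastMod.get(tr));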

@ -104,6 +104,11 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getL
// Collect cached cipher keys.
for (auto& domain : domains) {
if (domain.first == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (domain.first == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
Reference<BlobCipherKey> cachedCipherKey = cipherKeyCache->getLatestCipherKey(domain.first /*domainId*/);
if (cachedCipherKey.isValid()) {
cipherKeys[domain.first] = cachedCipherKey;
@ -301,7 +306,7 @@ template <class T>
Future<TextAndHeaderCipherKeys> getLatestSystemEncryptCipherKeys(const Reference<AsyncVar<T> const>& db,
BlobCipherMetrics::UsageType usageType) {
return getLatestEncryptCipherKeysForDomain(
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME, usageType);
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME, usageType);
}
ACTOR template <class T>

@ -151,6 +151,13 @@ public:
virtual ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) = 0;
virtual ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) = 0;
virtual ThreadFuture<bool> blobbifyRange(const KeyRangeRef& keyRange) = 0;
virtual ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) = 0;
virtual ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
int rangeLimit) = 0;
virtual ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) = 0;
virtual void addref() = 0;
virtual void delref() = 0;
};

@ -319,6 +319,11 @@ public:
tr->clear(key);
}
template <class Transaction>
Future<Void> watch(Transaction tr) {
return tr->watch(key);
}
Key key;
};
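// Illustrative usage of the new watch() helper (the same pattern TenantEntryCache uses later
// in this change): set the watch inside a transaction, commit, then wait for it to fire.
//
// state Future<Void> changed = TenantMetadata::lastTenantModification().watch(tr);
// wait(tr->commit());
// wait(changed); // fires once the watched key is modified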

@ -502,6 +502,7 @@ Future<Void> decommissionMetacluster(Reference<DB> db) {
ManagementClusterMetadata::tenantMetadata().lastTenantId.clear(tr);
ManagementClusterMetadata::tenantMetadata().tenantTombstones.clear(tr);
ManagementClusterMetadata::tenantMetadata().tombstoneCleanupData.clear(tr);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.clear(tr);
wait(managementClusterCheckEmpty(tr));
MetaclusterMetadata::metaclusterRegistration().clear(tr);
@ -797,6 +798,7 @@ struct RemoveClusterImpl {
ASSERT(entry.getString(0) == self->ctx.clusterName.get());
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, entry.getString(1));
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, entry.getInt(2));
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
// Erase all of the tenants processed in this transaction from the cluster tenant index
@ -1262,6 +1264,7 @@ struct CreateTenantImpl {
self->tenantEntry.tenantState = TenantState::REGISTERING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->tenantEntry);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantEntry.id, self->tenantName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue);
ManagementClusterMetadata::clusterTenantCount.atomicOp(
@ -1317,6 +1320,7 @@ struct CreateTenantImpl {
TenantMapEntry updatedEntry = managementEntry.get();
updatedEntry.tenantState = TenantState::READY;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
return Void();
@ -1446,6 +1450,7 @@ struct DeleteTenantImpl {
}
updatedEntry.tenantState = TenantState::REMOVING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// If this has a rename pair, also mark the other entry for deletion
if (self->pairName.present()) {
state Optional<TenantMapEntry> pairEntry = wait(tryGetTenantTransaction(tr, self->pairName.get()));
@ -1457,6 +1462,8 @@ struct DeleteTenantImpl {
CODE_PROBE(true, "marking pair tenant in removing state");
updatedPairEntry.tenantState = TenantState::REMOVING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->pairName.get(), updatedPairEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(
tr, Versionstamp(), 0);
}
}
@ -1485,6 +1492,7 @@ struct DeleteTenantImpl {
// Erase the tenant entry itself
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantName);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, tenantEntry.get().id);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// This is idempotent because this function is only called if the tenant is in the map
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue);
@ -1689,6 +1697,7 @@ struct ConfigureTenantImpl {
++self->updatedEntry.configurationSequenceNum;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
return Void();
}
@ -1724,6 +1733,7 @@ struct ConfigureTenantImpl {
tenantEntry.get().tenantState = TenantState::READY;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get());
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
return Void();
}
@ -1770,6 +1780,7 @@ struct RenameTenantImpl {
TenantMapEntry tenantEntry) {
// Erase the tenant entry itself
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->oldName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// Remove old tenant from tenant count
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue);
@ -1857,6 +1868,7 @@ struct RenameTenantImpl {
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->oldName, updatedOldEntry);
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// Add temporary tenant to tenantCount to prevent exceeding capacity during a rename
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue);
@ -1919,6 +1931,7 @@ struct RenameTenantImpl {
updatedNewEntry.renamePair.reset();
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantId, self->newName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
// We will remove the old entry from the management cluster

@ -221,6 +221,32 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
uint8_t const* purge_key_name,
int purge_key_name_length);
FDBFuture* (*tenantBlobbifyRange)(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length);
FDBFuture* (*tenantUnblobbifyRange)(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length);
FDBFuture* (*tenantListBlobbifiedRanges)(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int rangeLimit);
FDBFuture* (*tenantVerifyBlobRange)(FDBTenant* tenant,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t version);
void (*tenantDestroy)(FDBTenant* tenant);
// Transaction
@ -515,6 +541,13 @@ public:
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<bool> blobbifyRange(const KeyRangeRef& keyRange) override;
ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) override;
ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
int rangeLimit) override;
ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) override;
void addref() override { ThreadSafeReferenceCounted<DLTenant>::addref(); }
void delref() override { ThreadSafeReferenceCounted<DLTenant>::delref(); }
@ -562,6 +595,7 @@ public:
ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) override;
ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
int rangeLimit) override;
ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) override;
ThreadFuture<DatabaseSharedState*> createSharedState() override;
@ -812,6 +846,12 @@ public:
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<bool> blobbifyRange(const KeyRangeRef& keyRange) override;
ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) override;
ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
int rangeLimit) override;
ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) override;
void addref() override { ThreadSafeReferenceCounted<MultiVersionTenant>::addref(); }
void delref() override { ThreadSafeReferenceCounted<MultiVersionTenant>::delref(); }

@ -237,6 +237,8 @@ public:
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED;
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant in the TenantCache
// are refreshed
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -343,6 +345,8 @@ public:
int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD;
int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD;
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;
@ -624,6 +628,12 @@ public:
double GLOBAL_TAG_THROTTLING_FOLDING_TIME;
// Cost multiplier for writes (because write operations are more expensive than reads)
double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;
// Maximum number of tags tracked by global tag throttler. Additional tags will be ignored
// until some existing tags expire
int64_t GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED;
// Global tag throttler forgets about throughput from a tag once no new transactions from that
// tag have been received for this duration (in seconds):
int64_t GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER;
double MAX_TRANSACTIONS_PER_BYTE;

@ -103,7 +103,7 @@ struct StorageServerInterface {
PublicRequestStream<struct GetMappedKeyValuesRequest> getMappedKeyValues;
RequestStream<struct GetShardStateRequest> getShardState;
RequestStream<struct WaitMetricsRequest> waitMetrics;
PublicRequestStream<struct WaitMetricsRequest> waitMetrics;
RequestStream<struct SplitMetricsRequest> splitMetrics;
RequestStream<struct GetStorageMetricsRequest> getStorageMetrics;
RequestStream<ReplyPromise<Void>> waitFailure;
@ -161,7 +161,8 @@ public:
PublicRequestStream<struct GetKeyValuesRequest>(getValue.getEndpoint().getAdjustedEndpoint(2));
getShardState =
RequestStream<struct GetShardStateRequest>(getValue.getEndpoint().getAdjustedEndpoint(3));
waitMetrics = RequestStream<struct WaitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(4));
waitMetrics =
PublicRequestStream<struct WaitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(4));
splitMetrics = RequestStream<struct SplitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(5));
getStorageMetrics =
RequestStream<struct GetStorageMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(6));
@ -713,18 +714,25 @@ struct WaitMetricsRequest {
// Waits for any of the given minimum or maximum metrics to be exceeded, and then returns the current values
// Send a reversed range for min, max to receive an immediate report
constexpr static FileIdentifier file_identifier = 1795961;
// Setting the tenantInfo makes the request tenant-aware.
Optional<TenantInfo> tenantInfo;
Arena arena;
KeyRangeRef keys;
StorageMetrics min, max;
ReplyPromise<StorageMetrics> reply;
bool verify() const { return tenantInfo.present() && tenantInfo.get().isAuthorized(); }
WaitMetricsRequest() {}
WaitMetricsRequest(KeyRangeRef const& keys, StorageMetrics const& min, StorageMetrics const& max)
: keys(arena, keys), min(min), max(max) {}
WaitMetricsRequest(TenantInfo tenantInfo,
KeyRangeRef const& keys,
StorageMetrics const& min,
StorageMetrics const& max)
: tenantInfo(tenantInfo), keys(arena, keys), min(min), max(max) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keys, min, max, reply, arena);
serializer(ar, keys, min, max, reply, tenantInfo, arena);
}
};
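// Illustrative construction of the tenant-aware form added above (location variable
// hypothetical); reversed bounds (min.bytes = 0, max.bytes = -1) request an immediate report:
//
// WaitMetricsRequest req(tenantInfo, keys, StorageMetrics(), StorageMetrics());
// req.min.bytes = 0;
// req.max.bytes = -1;
// StorageMetrics m =
//     wait(loadBalance(locationInfo->locations(), &StorageServerInterface::waitMetrics, req));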

@ -597,8 +597,8 @@ Future<Void> enableAuto(Reference<DB> db, bool enabled) {
class TagQuotaValue {
public:
double reservedQuota{ 0.0 };
double totalQuota{ 0.0 };
int64_t reservedQuota{ 0 };
int64_t totalQuota{ 0 };
bool isValid() const;
Value toValue() const;
static TagQuotaValue fromValue(ValueRef);
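// Illustrative round trip under the new integer encoding (values hypothetical):
//
// TagQuotaValue q;
// q.reservedQuota = 1000;
// q.totalQuota = 4000;
// Value v = q.toValue(); // packs a two-field Tuple of ints, per fromValue() above
// TagQuotaValue parsed = TagQuotaValue::fromValue(v);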

@ -181,6 +181,7 @@ struct TenantMetadataSpecification {
KeyBackedObjectProperty<TenantTombstoneCleanupData, decltype(IncludeVersion())> tombstoneCleanupData;
KeyBackedSet<Tuple> tenantGroupTenantIndex;
KeyBackedObjectMap<TenantGroupName, TenantGroupEntry, decltype(IncludeVersion()), NullCodec> tenantGroupMap;
KeyBackedBinaryValue<Versionstamp> lastTenantModification;
TenantMetadataSpecification(KeyRef prefix)
: subspace(prefix.withSuffix("tenant/"_sr)), tenantMap(subspace.withSuffix("map/"_sr), IncludeVersion()),
@ -188,7 +189,8 @@ struct TenantMetadataSpecification {
tenantCount(subspace.withSuffix("count"_sr)), tenantTombstones(subspace.withSuffix("tombstones/"_sr)),
tombstoneCleanupData(subspace.withSuffix("tombstoneCleanup"_sr), IncludeVersion()),
tenantGroupTenantIndex(subspace.withSuffix("tenantGroup/tenantIndex/"_sr)),
tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()) {}
tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()),
lastTenantModification(subspace.withSuffix("lastModification"_sr)) {}
};
struct TenantMetadata {
@ -203,6 +205,7 @@ struct TenantMetadata {
static inline auto& tombstoneCleanupData() { return instance().tombstoneCleanupData; }
static inline auto& tenantGroupTenantIndex() { return instance().tenantGroupTenantIndex; }
static inline auto& tenantGroupMap() { return instance().tenantGroupMap; }
static inline auto& lastTenantModification() { return instance().lastTenantModification; }
static Key tenantMapPrivatePrefix();
};

@ -44,8 +44,14 @@
using TenantNameEntryPair = std::pair<TenantName, TenantMapEntry>;
using TenantNameEntryPairVec = std::vector<TenantNameEntryPair>;
enum class TenantEntryCacheRefreshReason { INIT = 1, PERIODIC_TASK = 2, CACHE_MISS = 3, REMOVE_ENTRY = 4 };
enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, NONE = 2 };
enum class TenantEntryCacheRefreshReason {
INIT = 1,
PERIODIC_TASK = 2,
CACHE_MISS = 3,
REMOVE_ENTRY = 4,
WATCH_TRIGGER = 5
};
enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, WATCH = 2, NONE = 3 };
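// Illustrative sketch (getById and the payload type are assumed from the implementation
// below): construct a cache in WATCH mode so misses trigger point refreshes and the
// lastTenantModification watch keeps entries current.
//
// state Reference<TenantEntryCache<Void>> cache =
//     makeReference<TenantEntryCache<Void>>(db, TenantEntryCacheRefreshMode::WATCH);
// wait(cache->init());
// Optional<TenantEntryCachePayload<Void>> entry = wait(cache->getById(tenantId));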
template <class T>
struct TenantEntryCachePayload {
@ -62,12 +68,6 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con
// 1. Lookup by 'TenantId'
// 2. Lookup by 'TenantPrefix'
// 3. Lookup by 'TenantName'
//
// TODO:
// ----
// The cache allows user to construct the 'cached object' by supplying a callback. The cache implements a periodic
// refresh mechanism, polling underlying database for updates (add/remove tenants), in future we might want to implement
// database range-watch to monitor such updates
template <class T>
class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable {
@ -78,6 +78,10 @@ private:
TenantEntryCacheRefreshMode refreshMode;
Future<Void> refresher;
Future<Void> watchRefresher;
Future<Void> lastTenantIdRefresher;
Promise<Void> setInitialWatch;
Optional<int64_t> lastTenantId;
Map<int64_t, TenantEntryCachePayload<T>> mapByTenantId;
Map<TenantName, TenantEntryCachePayload<T>> mapByTenantName;
@ -87,6 +91,7 @@ private:
Counter refreshByCacheInit;
Counter refreshByCacheMiss;
Counter numRefreshes;
Counter refreshByWatchTrigger;
ACTOR static Future<TenantNameEntryPairVec> getTenantList(Reference<ReadYourWritesTransaction> tr) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
@ -102,16 +107,166 @@ private:
return tenantList.results;
}
ACTOR static Future<Void> refreshCacheById(int64_t tenantId,
TenantEntryCache<T>* cache,
TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheIDRefreshStart", cache->id()).detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
state Optional<TenantName> name = wait(TenantMetadata::tenantIdIndex().get(tr, tenantId));
if (name.present()) {
Optional<TenantMapEntry> entry = wait(TenantMetadata::tenantMap().get(tr, name.get()));
if (entry.present()) {
cache->put(std::make_pair(name.get(), entry.get()));
updateCacheRefreshMetrics(cache, reason);
}
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
TraceEvent(SevDebug, "TenantEntryCacheIDRefreshEnd", cache->id()).detail("Reason", static_cast<int>(reason));
return Void();
}
ACTOR static Future<Void> refreshCacheByName(TenantName name,
TenantEntryCache<T>* cache,
TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheNameRefreshStart", cache->id())
.detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<TenantMapEntry> entry = wait(TenantMetadata::tenantMap().get(tr, name));
if (entry.present()) {
cache->put(std::make_pair(name, entry.get()));
updateCacheRefreshMetrics(cache, reason);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
TraceEvent(SevDebug, "TenantEntryCacheNameRefreshEnd", cache->id()).detail("Reason", static_cast<int>(reason));
return Void();
}
static void updateCacheRefreshMetrics(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
if (reason == TenantEntryCacheRefreshReason::INIT) {
cache->refreshByCacheInit += 1;
} else if (reason == TenantEntryCacheRefreshReason::CACHE_MISS) {
cache->refreshByCacheMiss += 1;
} else if (reason == TenantEntryCacheRefreshReason::WATCH_TRIGGER) {
cache->refreshByWatchTrigger += 1;
}
cache->numRefreshes += 1;
}
ACTOR static Future<Void> refreshCacheUsingWatch(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheRefreshUsingWatchStart", cache->id())
.detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Future<Void> tenantModifiedWatch = TenantMetadata::lastTenantModification().watch(tr);
wait(tr->commit());
TraceEvent(SevDebug, "TenantEntryCacheRefreshWatchSet", cache->id());
// setInitialWatch is set to indicate that an initial watch has been set for the lastTenantModification
// key. Currently this is only used in simulation to avoid a race condition where a tenant is created
// before the initial watch is set. However, it can be enabled by passing waitForInitalWatch = true to
// the init() method.
if (cache->setInitialWatch.canBeSet()) {
cache->setInitialWatch.send(Void());
}
wait(tenantModifiedWatch);
// If the watch triggered, refresh the cache since tenant metadata was updated
TraceEvent(SevDebug, "TenantEntryCacheRefreshUsingWatchTriggered", cache->id())
.detail("Reason", static_cast<int>(reason));
wait(refreshImpl(cache, reason));
tr->reset();
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent("TenantEntryCacheRefreshUsingWatchError", cache->id())
.errorUnsuppressed(e)
.suppressFor(1.0);
}
wait(tr->onError(e));
// If the watch threw an error, refresh the cache anyway in case tenant metadata was updated
wait(refreshImpl(cache, reason));
}
}
}
static bool tenantsEnabled(TenantEntryCache<T>* cache) {
// Avoid using the cache if the tenant mode is disabled. However, since we use clientInfo, it may not
// always be fully up to date (i.e., it may report that the tenant mode is disabled when it is in fact
// required). Thus, if at least one tenant has been created on the cluster, use the cache anyway to
// avoid an incorrect miss.
if (cache->getDatabase()->clientInfo->get().tenantMode == TenantMode::DISABLED) {
if (!cache->lastTenantId.present()) {
return false;
}
return cache->lastTenantId.get() > 0;
}
return true;
}
ACTOR static Future<Void> setLastTenantId(TenantEntryCache<T>* cache) {
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<int64_t> lastTenantId = wait(TenantMetadata::lastTenantId().get(tr));
cache->lastTenantId = lastTenantId;
return Void();
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR static Future<Void> lastTenantIdWatch(TenantEntryCache<T>* cache) {
TraceEvent(SevDebug, "TenantEntryCacheLastTenantIdWatchStart", cache->id());
// Monitor for any changes to the last tenant id and update it as necessary
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Future<Void> lastTenantIdWatch = tr->watch(TenantMetadata::lastTenantId().key);
wait(tr->commit());
wait(lastTenantIdWatch);
wait(setLastTenantId(cache));
tr->reset();
} catch (Error& e) {
state Error err(e);
if (err.code() != error_code_actor_cancelled) {
TraceEvent("TenantEntryCacheLastTenantIdWatchError", cache->id())
.errorUnsuppressed(err)
.suppressFor(1.0);
// If the watch errors out, refresh lastTenantId in case it changed while the watch was down and
// we missed an update
wait(setLastTenantId(cache));
}
wait(tr->onError(err));
}
}
}
ACTOR static Future<Void> refreshImpl(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheRefreshStart", cache->id()).detail("Reason", static_cast<int>(reason));
@ -130,9 +285,7 @@ private:
break;
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevInfo, "TenantEntryCacheRefreshError", cache->id())
.errorUnsuppressed(e)
.suppressFor(1.0);
TraceEvent("TenantEntryCacheRefreshError", cache->id()).errorUnsuppressed(e).suppressFor(1.0);
}
wait(tr->onError(e));
}
@ -151,12 +304,22 @@ private:
return ret;
}
TraceEvent(SevInfo, "TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId);
if (!tenantsEnabled(cache)) {
// If tenants are disabled on the cluster avoid using the cache
return Optional<TenantEntryCachePayload<T>>();
}
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
// TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any
// existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
TraceEvent("TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId);
if (cache->refreshMode == TenantEntryCacheRefreshMode::WATCH) {
// Entry not found. Do a point refresh
// TODO: Don't initiate refresh if tenantId < maxTenantId (stored as a system key currently) as we know that
// such a tenant does not exist (it has either never existed or has been deleted)
wait(refreshCacheById(tenantId, cache, TenantEntryCacheRefreshReason::CACHE_MISS));
} else {
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
}
cache->misses += 1;
return cache->lookupById(tenantId);
@ -170,12 +333,20 @@ private:
return ret;
}
if (!tenantsEnabled(cache)) {
// If tenants are disabled on the cluster avoid using the cache
return Optional<TenantEntryCachePayload<T>>();
}
TraceEvent("TenantEntryCacheGetByNameRefresh").detail("TenantName", name);
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
// TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any
// existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
if (cache->refreshMode == TenantEntryCacheRefreshMode::WATCH) {
// Entry not found. Do a point refresh
wait(refreshCacheByName(name, cache, TenantEntryCacheRefreshReason::CACHE_MISS));
} else {
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
}
cache->misses += 1;
return cache->lookupByName(name);
@ -272,7 +443,18 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid);
}
TenantEntryCache(Database db, TenantEntryCacheRefreshMode mode)
: uid(deterministicRandom()->randomUniqueID()), db(db), createPayloadFunc(defaultCreatePayload),
refreshMode(mode), metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics),
misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid);
}
@ -282,7 +464,8 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
@ -291,7 +474,8 @@ public:
metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics),
misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
@ -300,26 +484,36 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
Future<Void> init() {
Future<Void> init(bool waitForInitialWatch = false) {
TraceEvent("TenantEntryCacheInit", uid);
Future<Void> f = refreshImpl(this, TenantEntryCacheRefreshReason::INIT);
// Launch reaper task to periodically refresh cache by scanning database KeyRange
TenantEntryCacheRefreshReason reason = TenantEntryCacheRefreshReason::PERIODIC_TASK;
Future<Void> initialWatchFuture = Void();
lastTenantIdRefresher = lastTenantIdWatch(this);
if (refreshMode == TenantEntryCacheRefreshMode::PERIODIC_TASK) {
refresher = recurringAsync([&, reason]() { return refresh(reason); },
CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* interval */
true, /* absoluteIntervalDelay */
CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* initialDelay */
TaskPriority::Worker);
} else if (refreshMode == TenantEntryCacheRefreshMode::WATCH) {
if (waitForInitialWatch) {
initialWatchFuture = setInitialWatch.getFuture();
}
watchRefresher = refreshCacheUsingWatch(this, TenantEntryCacheRefreshReason::WATCH_TRIGGER);
}
return f;
Future<Void> setLastTenant = setLastTenantId(this);
return f && initialWatchFuture && setLastTenant;
}
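
A minimal usage sketch of the new init() flag, assuming an actor with a Database db in scope (the payload type Void and the call site are illustrative):

state TenantEntryCache<Void> tenantCache(db, TenantEntryCacheRefreshMode::WATCH);
// In simulation, block until the first watch is armed so that tenants created
// immediately afterwards are guaranteed to be observed by the cache.
wait(tenantCache.init(true /* waitForInitialWatch */));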
Database getDatabase() const { return db; }
@ -384,6 +578,7 @@ public:
Counter::Value numCacheRefreshes() const { return numRefreshes.getValue(); }
Counter::Value numRefreshByMisses() const { return refreshByCacheMiss.getValue(); }
Counter::Value numRefreshByInit() const { return refreshByCacheInit.getValue(); }
Counter::Value numWatchRefreshes() const { return refreshByWatchTrigger.getValue(); }
};
#include "flow/unactorcompiler.h"

View File

@ -178,6 +178,7 @@ Future<std::pair<Optional<TenantMapEntry>, bool>> createTenantTransaction(
TenantMetadata::tenantMap().set(tr, name, tenantEntry);
TenantMetadata::tenantIdIndex().set(tr, tenantEntry.id, name);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
if (tenantEntry.tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex().insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name));
@ -346,6 +347,7 @@ Future<Void> deleteTenantTransaction(Transaction tr,
TenantMetadata::tenantMap().erase(tr, name);
TenantMetadata::tenantIdIndex().erase(tr, tenantEntry.get().id);
TenantMetadata::tenantCount().atomicOp(tr, -1, MutationRef::AddValue);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
if (tenantEntry.get().tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex().erase(tr,
@ -420,6 +422,7 @@ Future<Void> configureTenantTransaction(Transaction tr,
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
TenantMetadata::tenantMap().set(tr, tenantName, updatedTenantEntry);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
// If the tenant group was changed, we need to update the tenant group metadata structures
if (originalEntry.tenantGroup != updatedTenantEntry.tenantGroup) {
@ -523,6 +526,7 @@ Future<Void> renameTenantTransaction(Transaction tr,
TenantMetadata::tenantMap().erase(tr, oldName);
TenantMetadata::tenantMap().set(tr, newName, oldEntry.get());
TenantMetadata::tenantIdIndex().set(tr, oldEntry.get().id, newName);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
// Update the tenant group index to reflect the new tenant name
if (oldEntry.get().tenantGroup.present()) {
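
lastTenantModification is a single well-known key that every tenant mutation above stamps with the commit version, so one watch on that key observes all tenant metadata changes. For reference, a versionstamped write looks like this through the fdb_c API; the fragment is illustrative (it assumes an open FDBTransaction* tr and a made-up key, not the actual system key):

// The 10-byte versionstamp (8-byte commit version + 2-byte batch order) is filled
// in at commit time. For API version >= 520, the last 4 bytes of the parameter are
// a little-endian offset telling the resolver where to place the stamp.
uint8_t value[14] = { 0 }; // 10 placeholder bytes + 4-byte offset of 0
const char* key = "exampleLastModification"; // illustrative key only
fdb_transaction_atomic_op(tr,
                          (const uint8_t*)key,
                          (int)strlen(key),
                          value,
                          sizeof(value),
                          FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_VALUE);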

View File

@ -96,6 +96,13 @@ public:
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<bool> blobbifyRange(const KeyRangeRef& keyRange) override;
ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) override;
ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
int rangeLimit) override;
ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) override;
void addref() override { ThreadSafeReferenceCounted<ThreadSafeTenant>::addref(); }
void delref() override { ThreadSafeReferenceCounted<ThreadSafeTenant>::delref(); }

View File

@ -42,6 +42,8 @@ struct TenantInfo {
// Is set during deserialization. It will be set to true if the tenant
// name is set and the client is authorized to use this tenant.
bool tenantAuthorized = false;
// Number of storage bytes currently used by this tenant.
int64_t storageUsage = 0;
// Helper function for most endpoints that read/write data. This returns true iff
// the client is either a) a trusted peer or b) is accessing keyspace belonging to a tenant,

View File

@ -687,6 +687,9 @@ struct DDQueue : public IDDRelocationQueue {
Reference<EventCacheHolder> movedKeyServersEventHolder;
int moveReusePhysicalShard;
int moveCreateNewPhysicalShard;
void startRelocation(int priority, int healthPriority) {
// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
// we must count it into unhealthyRelocations, because the team remover relies on unhealthyRelocations to
@ -750,7 +753,8 @@ struct DDQueue : public IDDRelocationQueue {
output(output), input(input), getShardMetrics(getShardMetrics), getTopKMetrics(getTopKMetrics), lastInterval(0),
suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")) {}
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
moveCreateNewPhysicalShard(0) {}
DDQueue() = default;
void validate() {
@ -1676,6 +1680,11 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate
// thus, update the physicalShardIDCandidate to related data structures
ASSERT(physicalShardIDCandidate != UID().first());
if (self->physicalShardCollection->physicalShardExists(physicalShardIDCandidate)) {
self->moveReusePhysicalShard++;
} else {
self->moveCreateNewPhysicalShard++;
}
rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
inFlightRange.value().dataMoveId = rd.dataMoveId;
@ -2472,6 +2481,14 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
.trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by
// DataDistributor::movingDataEventHolder. The track latest
// key we use here must match the key used in the holder.
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
TraceEvent("PhysicalShardMoveStats")
.detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
.detail("MoveReusePhysicalShard", self.moveReusePhysicalShard);
self.moveCreateNewPhysicalShard = 0;
self.moveReusePhysicalShard = 0;
}
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(ddQueueFutures))) {}

View File

@ -2081,6 +2081,10 @@ void PhysicalShardCollection::logPhysicalShardCollection() {
}
}
bool PhysicalShardCollection::physicalShardExists(uint64_t physicalShardID) {
return physicalShardInstances.find(physicalShardID) != physicalShardInstances.end();
}
// FIXME: complete this test with non-empty range
TEST_CASE("/DataDistributor/Tracker/FetchTopK") {
state DataDistributionTracker self;

View File

@ -25,6 +25,8 @@
#include "fdbclient/DatabaseContext.h"
#include "flow/actorcompiler.h" // This must be the last #include.
FDB_DEFINE_BOOLEAN_PARAM(SkipDDModeCheck);
class DDTxnProcessorImpl {
friend class DDTxnProcessor;
@ -240,7 +242,8 @@ class DDTxnProcessorImpl {
UID distributorId,
MoveKeysLock moveKeysLock,
std::vector<Optional<Key>> remoteDcIds,
const DDEnabledState* ddEnabledState) {
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
state Reference<InitialDataDistribution> result = makeReference<InitialDataDistribution>();
state Key beginKey = allKeys.begin;
@ -253,6 +256,7 @@ class DDTxnProcessorImpl {
state std::vector<std::pair<StorageServerInterface, ProcessClass>> tss_servers;
state int numDataMoves = 0;
CODE_PROBE((bool)skipDDModeCheck, "DD Mode won't prevent read initial data distribution.");
// Get the server list in its own try/catch block since it modifies result. We don't want a subsequent failure
// causing entries to be duplicated
loop {
@ -285,7 +289,7 @@ class DDTxnProcessorImpl {
BinaryReader rd(mode.get(), Unversioned());
rd >> result->mode;
}
if (!result->mode || !ddEnabledState->isDDEnabled()) {
if ((!skipDDModeCheck && !result->mode) || !ddEnabledState->isDDEnabled()) {
// DD can be disabled persistently (result->mode = 0) or transiently (isDDEnabled() = 0)
TraceEvent(SevDebug, "GetInitialDataDistribution_DisabledDD").log();
return result;
@ -620,8 +624,10 @@ Future<Reference<InitialDataDistribution>> DDTxnProcessor::getInitialDataDistrib
const UID& distributorId,
const MoveKeysLock& moveKeysLock,
const std::vector<Optional<Key>>& remoteDcIds,
const DDEnabledState* ddEnabledState) {
return DDTxnProcessorImpl::getInitialDataDistribution(cx, distributorId, moveKeysLock, remoteDcIds, ddEnabledState);
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
return DDTxnProcessorImpl::getInitialDataDistribution(
cx, distributorId, moveKeysLock, remoteDcIds, ddEnabledState, skipDDModeCheck);
}
Future<Void> DDTxnProcessor::waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const {
@ -681,6 +687,33 @@ Future<std::vector<ProcessData>> DDTxnProcessor::getWorkers() const {
return ::getWorkers(cx);
}
Future<Void> DDTxnProcessor::rawStartMovement(MoveKeysParams& params,
std::map<UID, StorageServerInterface>& tssMapping) {
return ::rawStartMovement(cx, params, tssMapping);
}
Future<Void> DDTxnProcessor::rawFinishMovement(MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
return ::rawFinishMovement(cx, params, tssMapping);
}
struct DDMockTxnProcessorImpl {
ACTOR static Future<Void> moveKeys(DDMockTxnProcessor* self, MoveKeysParams params) {
state std::map<UID, StorageServerInterface> tssMapping;
self->rawStartMovement(params, tssMapping);
ASSERT(tssMapping.empty());
if (BUGGIFY_WITH_PROB(0.5)) {
wait(delayJittered(5.0));
}
self->rawFinishMovement(params, tssMapping);
if (!params.dataMovementComplete.isSet())
params.dataMovementComplete.send(Void());
return Void();
}
};
Future<ServerWorkerInfos> DDMockTxnProcessor::getServerListAndProcessClasses() {
ServerWorkerInfos res;
for (auto& [_, mss] : mgs->allServers) {
@ -757,7 +790,8 @@ Future<Reference<InitialDataDistribution>> DDMockTxnProcessor::getInitialDataDis
const UID& distributorId,
const MoveKeysLock& moveKeysLock,
const std::vector<Optional<Key>>& remoteDcIds,
const DDEnabledState* ddEnabledState) {
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
// FIXME: now we just ignore ddEnabledState and moveKeysLock, will fix it in the future
Reference<InitialDataDistribution> res = makeReference<InitialDataDistribution>();
@ -817,9 +851,10 @@ void DDMockTxnProcessor::setupMockGlobalState(Reference<InitialDataDistribution>
mgs->shardMapping->setCheckMode(ShardsAffectedByTeamFailure::CheckMode::Normal);
}
// FIXME: finish moveKeys implementation
Future<Void> DDMockTxnProcessor::moveKeys(const MoveKeysParams& params) {
UNREACHABLE();
// Not support location metadata yet
ASSERT(!SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
return DDMockTxnProcessorImpl::moveKeys(this, params);
}
// FIXME: finish implementation
@ -851,3 +886,48 @@ Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorage
Future<std::vector<ProcessData>> DDMockTxnProcessor::getWorkers() const {
return Future<std::vector<ProcessData>>();
}
void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::map<UID, StorageServerInterface>& tssMapping) {
FlowLock::Releaser releaser(*params.startMoveKeysParallelismLock);
// A wait(take) here would always return immediately because there won't be parallel rawStart or rawFinish in the
// mock world: the following *mock* transaction code always finishes without a coroutine switch.
ASSERT(params.startMoveKeysParallelismLock->take().isReady());
std::vector<ShardsAffectedByTeamFailure::Team> destTeams;
destTeams.emplace_back(params.destinationTeam, true);
mgs->shardMapping->moveShard(params.keys, destTeams);
for (auto& id : params.destinationTeam) {
mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::INFLIGHT, mgs->restrictSize);
}
}
void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
FlowLock::Releaser releaser(*params.finishMoveKeysParallelismLock);
// A wait(take) here would always return immediately because there won't be parallel rawStart or rawFinish in the
// mock world: the following *mock* transaction code always finishes without a coroutine switch.
ASSERT(params.finishMoveKeysParallelismLock->take().isReady());
// get source and dest teams
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsFor(params.keys);
ASSERT_EQ(destTeams.size(), 1);
if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) {
TraceEvent(SevError, "MockRawFinishMovementError")
.detail("Reason", "InconsistentDestinations")
.detail("ShardMappingDest", describe(destTeams.front().servers))
.detail("ParamDest", describe(params.destinationTeam));
ASSERT(false); // This shouldn't happen because the overlapped key range movement won't be executed in parallel
}
for (auto& id : params.destinationTeam) {
mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::COMPLETED, mgs->restrictSize);
}
ASSERT_EQ(srcTeams.size(), 1);
for (auto& id : srcTeams.front().servers) {
mgs->allServers.at(id).removeShard(params.keys);
}
mgs->shardMapping->finishMove(params.keys);
}

View File

@ -316,7 +316,8 @@ public:
ddId,
lock,
configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(),
context->ddEnabledState.get()));
context->ddEnabledState.get(),
SkipDDModeCheck::False));
}
void initDcInfo() {
@ -692,6 +693,10 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
"DDTenantCacheMonitor",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(),
"StorageUsageTracker",
self->ddId,
&normalDDQueueErrors()));
}
std::vector<DDTeamCollection*> teamCollectionsPtrs;

View File

@ -31,6 +31,7 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "flow/Arena.h"
#include "flow/CodeProbe.h"
#include "flow/EncryptUtils.h"
#include "flow/Error.h"
#include "flow/EventTypes.actor.h"
@ -387,6 +388,15 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData
try {
KmsConnLookupEKsByKeyIdsReq keysByIdsReq;
for (const auto& item : lookupCipherInfoMap) {
// TODO: Currently getEncryptCipherKeys does not pass the domain name; once that is fixed we can remove
// the check on the empty domain name
if (!item.second.domainName.empty()) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
}
keysByIdsReq.encryptKeyInfos.emplace_back_deep(
keysByIdsReq.arena, item.second.domainId, item.second.baseCipherId, item.second.domainName);
}
@ -452,6 +462,8 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData
keyIdsReply.numHits = cachedCipherDetails.size();
keysByIds.reply.send(keyIdsReply);
CODE_PROBE(!lookupCipherInfoMap.empty(), "EKP fetch cipherKeys by KeyId from KMS");
return Void();
}
@ -475,13 +487,13 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
// Dedup the requested domainIds.
// TODO: endpoint serialization of std::unordered_set isn't working at the moment
std::unordered_map<EncryptCipherDomainId, EKPGetLatestCipherKeysRequestInfo> dedupedDomainInfos;
for (const auto info : req.encryptDomainInfos) {
for (const auto& info : req.encryptDomainInfos) {
dedupedDomainInfos.emplace(info.domainId, info);
}
if (dbgTrace.present()) {
dbgTrace.get().detail("NKeys", dedupedDomainInfos.size());
for (const auto info : dedupedDomainInfos) {
for (const auto& info : dedupedDomainInfos) {
// log encryptDomainIds queried
dbgTrace.get().detail(
getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_QUERY_PREFIX, info.first, info.second.domainName), "");
@ -524,6 +536,11 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
try {
KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq;
for (const auto& item : lookupCipherDomains) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
keysByDomainIdReq.encryptDomainInfos.emplace_back_deep(
keysByDomainIdReq.arena, item.second.domainId, item.second.domainName);
}
@ -588,6 +605,8 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
latestCipherReply.numHits = cachedCipherDetails.size();
latestKeysReq.reply.send(latestCipherReply);
CODE_PROBE(!lookupCipherDomains.empty(), "EKP fetch latest cipherKeys from KMS");
return Void();
}
@ -610,7 +629,7 @@ bool isBlobMetadataEligibleForRefresh(const BlobMetadataDetailsRef& blobMetadata
return nextRefreshCycleTS > blobMetadata.expireAt || nextRefreshCycleTS > blobMetadata.refreshAt;
}
ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpProxyData,
ACTOR Future<Void> refreshEncryptionKeysImpl(Reference<EncryptKeyProxyData> ekpProxyData,
KmsConnectorInterface kmsConnectorInf) {
state UID debugId = deterministicRandom()->randomUniqueID();
@ -672,6 +691,7 @@ ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpP
ekpProxyData->baseCipherKeysRefreshed += rep.cipherKeyDetails.size();
t.detail("NumKeys", rep.cipherKeyDetails.size());
CODE_PROBE(!rep.cipherKeyDetails.empty(), "EKP refresh cipherKeys");
} catch (Error& e) {
if (!canReplyWith(e)) {
TraceEvent(SevWarn, "RefreshEKsError").error(e);
@ -685,7 +705,7 @@ ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpP
}
Future<Void> refreshEncryptionKeys(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnectorInterface kmsConnectorInf) {
return refreshEncryptionKeysCore(ekpProxyData, kmsConnectorInf);
return refreshEncryptionKeysImpl(ekpProxyData, kmsConnectorInf);
}
ACTOR Future<Void> getLatestBlobMetadata(Reference<EncryptKeyProxyData> ekpProxyData,
@ -775,7 +795,7 @@ ACTOR Future<Void> refreshBlobMetadataCore(Reference<EncryptKeyProxyData> ekpPro
state UID debugId = deterministicRandom()->randomUniqueID();
state double startTime;
state TraceEvent t("RefreshBlobMetadata_Start", ekpProxyData->myId);
state TraceEvent t("RefreshBlobMetadataStart", ekpProxyData->myId);
t.setMaxEventLength(SERVER_KNOBS->ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH);
t.detail("KmsConnInf", kmsConnectorInf.id());
t.detail("DebugId", debugId);
@ -817,7 +837,7 @@ ACTOR Future<Void> refreshBlobMetadataCore(Reference<EncryptKeyProxyData> ekpPro
t.detail("nKeys", rep.metadataDetails.size());
} catch (Error& e) {
if (!canReplyWith(e)) {
TraceEvent("RefreshBlobMetadata_Error").error(e);
TraceEvent("RefreshBlobMetadataError").error(e);
throw e;
}
TraceEvent("RefreshBlobMetadata").detail("ErrorCode", e.code());
@ -832,24 +852,25 @@ void refreshBlobMetadata(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnect
}
void activateKmsConnector(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnectorInterface kmsConnectorInf) {
if (g_network->isSimulated() || (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(FDB_PREF_KMS_CONNECTOR_TYPE_STR) == 0)) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>();
if (g_network->isSimulated()) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>(FDB_SIM_KMS_CONNECTOR_TYPE_STR);
} else if (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(FDB_PREF_KMS_CONNECTOR_TYPE_STR) == 0) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>(FDB_PREF_KMS_CONNECTOR_TYPE_STR);
} else if (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(REST_KMS_CONNECTOR_TYPE_STR) == 0) {
ekpProxyData->kmsConnector = std::make_unique<RESTKmsConnector>();
ekpProxyData->kmsConnector = std::make_unique<RESTKmsConnector>(REST_KMS_CONNECTOR_TYPE_STR);
} else {
throw not_implemented();
}
TraceEvent("EKPActiveKmsConnector", ekpProxyData->myId)
.detail("ConnectorType",
g_network->isSimulated() ? FDB_SIM_KMS_CONNECTOR_TYPE_STR : SERVER_KNOBS->KMS_CONNECTOR_TYPE)
.detail("ConnectorType", ekpProxyData->kmsConnector->getConnectorStr())
.detail("InfId", kmsConnectorInf.id());
ekpProxyData->addActor.send(ekpProxyData->kmsConnector->connectorCore(kmsConnectorInf));
}
ACTOR Future<Void> encryptKeyProxyServer(EncryptKeyProxyInterface ekpInterface, Reference<AsyncVar<ServerDBInfo>> db) {
state Reference<EncryptKeyProxyData> self(new EncryptKeyProxyData(ekpInterface.id()));
state Reference<EncryptKeyProxyData> self = makeReference<EncryptKeyProxyData>(ekpInterface.id());
state Future<Void> collection = actorCollection(self->addActor.getFuture());
self->addActor.send(traceRole(Role::ENCRYPT_KEY_PROXY, ekpInterface.id()));

View File

@ -120,12 +120,13 @@ class GlobalTagThrottlerImpl {
Smoother transactionCounter;
Smoother perClientRate;
Smoother targetRate;
double transactionsLastAdded;
public:
explicit PerTagStatistics()
: transactionCounter(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME),
perClientRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME),
targetRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME) {}
targetRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME), transactionsLastAdded(now()) {}
Optional<ThrottleApi::TagQuotaValue> getQuota() const { return quota; }
@ -133,7 +134,10 @@ class GlobalTagThrottlerImpl {
void clearQuota() { quota = {}; }
void addTransactions(int count) { transactionCounter.addDelta(count); }
void addTransactions(int count) {
transactionsLastAdded = now();
transactionCounter.addDelta(count);
}
double getTransactionRate() const { return transactionCounter.smoothRate(); }
@ -151,6 +155,10 @@ class GlobalTagThrottlerImpl {
targetRate.setTotal(targetTps);
return targetRate.smoothTotal();
}
bool recentTransactionsAdded() const {
return now() - transactionsLastAdded < SERVER_KNOBS->GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER;
}
};
Database db;
@ -278,7 +286,7 @@ class GlobalTagThrottlerImpl {
for (const auto& t : tagsAffectingStorageServer) {
auto const tQuota = getQuota(t, LimitType::TOTAL);
sumQuota += tQuota.orDefault(0);
if (tag.compare(tag) == 0) {
if (t.compare(tag) == 0) {
tagQuota = tQuota.orDefault(0);
}
}
@ -360,6 +368,7 @@ class GlobalTagThrottlerImpl {
tagsWithQuota.insert(tag);
}
self->removeUnseenQuotas(tagsWithQuota);
self->removeExpiredTags();
++self->throttledTagChangeId;
wait(delay(5.0));
break;
@ -397,7 +406,24 @@ class GlobalTagThrottlerImpl {
public:
GlobalTagThrottlerImpl(Database db, UID id) : db(db), id(id) {}
Future<Void> monitorThrottlingChanges() { return monitorThrottlingChanges(this); }
void addRequests(TransactionTag tag, int count) { tagStatistics[tag].addTransactions(static_cast<double>(count)); }
void addRequests(TransactionTag tag, int count) {
auto it = tagStatistics.find(tag);
if (it == tagStatistics.end()) {
if (tagStatistics.size() == SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED) {
CODE_PROBE(true,
"Global tag throttler ignoring transactions because maximum number of trackable tags has "
"been reached");
TraceEvent("GlobalTagThrottler_IgnoringRequests")
.suppressFor(1.0)
.detail("Tag", printable(tag))
.detail("Count", count);
} else {
tagStatistics[tag].addTransactions(static_cast<double>(count));
}
} else {
it->second.addTransactions(static_cast<double>(count));
}
}
uint64_t getThrottledTagChangeId() const { return throttledTagChangeId; }
TransactionTagMap<double> getProxyRates(int numProxies) {
@ -465,10 +491,14 @@ public:
throttlingRatios[ss.id] = ss.getThrottlingRatio(SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER,
SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER);
for (const auto& busyReadTag : ss.busiestReadTags) {
throughput[ss.id][busyReadTag.tag].updateCost(busyReadTag.rate, OpType::READ);
if (tagStatistics.find(busyReadTag.tag) != tagStatistics.end()) {
throughput[ss.id][busyReadTag.tag].updateCost(busyReadTag.rate, OpType::READ);
}
}
for (const auto& busyWriteTag : ss.busiestWriteTags) {
throughput[ss.id][busyWriteTag.tag].updateCost(busyWriteTag.rate, OpType::WRITE);
if (tagStatistics.find(busyWriteTag.tag) != tagStatistics.end()) {
throughput[ss.id][busyWriteTag.tag].updateCost(busyWriteTag.rate, OpType::WRITE);
}
}
return Void();
}
@ -478,6 +508,22 @@ public:
}
void removeQuota(TransactionTagRef tag) { tagStatistics[tag].clearQuota(); }
void removeExpiredTags() {
for (auto it = tagStatistics.begin(); it != tagStatistics.end();) {
const auto& [tag, stats] = *it;
if (!stats.recentTransactionsAdded()) {
for (auto& [ss, tagToCounters] : throughput) {
tagToCounters.erase(tag);
}
it = tagStatistics.erase(it);
} else {
++it;
}
}
}
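
The cap-and-expire bookkeeping above is easy to isolate. A self-contained plain-C++ sketch of the same idea, where maxTags and expireAfter stand in for the knobs GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED and GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER (names illustrative, not the actual implementation):

#include <chrono>
#include <cstdint>
#include <string>
#include <unordered_map>

// Tracks per-tag activity, refuses new tags beyond a cap, and expires idle tags.
class TagTracker {
    using Clock = std::chrono::steady_clock;
    struct Stats {
        Clock::time_point lastAdded{};
        uint64_t count = 0;
    };
    std::unordered_map<std::string, Stats> tags;
    size_t maxTags;
    std::chrono::seconds expireAfter;

public:
    TagTracker(size_t maxTags, std::chrono::seconds expireAfter)
      : maxTags(maxTags), expireAfter(expireAfter) {}

    void addTransactions(const std::string& tag, uint64_t count) {
        auto it = tags.find(tag);
        if (it == tags.end()) {
            if (tags.size() >= maxTags)
                return; // at capacity: ignore transactions for brand-new tags
            it = tags.emplace(tag, Stats{}).first;
        }
        it->second.lastAdded = Clock::now();
        it->second.count += count;
    }

    // Erase tags that have seen no transactions for at least expireAfter.
    void removeExpiredTags() {
        auto current = Clock::now();
        for (auto it = tags.begin(); it != tags.end();) {
            if (current - it->second.lastAdded >= expireAfter)
                it = tags.erase(it);
            else
                ++it;
        }
    }

    size_t tagsTracked() const { return tags.size(); }
};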
uint32_t tagsTracked() const { return tagStatistics.size(); }
};
GlobalTagThrottler::GlobalTagThrottler(Database db, UID id) : impl(PImpl<GlobalTagThrottlerImpl>::create(db, id)) {}
@ -526,6 +572,14 @@ void GlobalTagThrottler::removeQuota(TransactionTagRef tag) {
return impl->removeQuota(tag);
}
uint32_t GlobalTagThrottler::tagsTracked() const {
return impl->tagsTracked();
}
void GlobalTagThrottler::removeExpiredTags() {
return impl->removeExpiredTags();
}
namespace GlobalTagThrottlerTesting {
enum class LimitType { RESERVED, TOTAL };
@ -1025,3 +1079,47 @@ TEST_CASE("/GlobalTagThrottler/ReservedQuota") {
wait(timeoutError(monitor || client || updater, 600.0));
return Void();
}
// Test that tags are expired iff a sufficient amount of time has passed since the
// last transaction with that tag
TEST_CASE("/GlobalTagThrottler/ExpireTags") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 5);
TransactionTag testTag = "sampleTag1"_sr;
state Future<Void> client =
timeout(GlobalTagThrottlerTesting::runClient(
&globalTagThrottler, &storageServers, testTag, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ),
60.0,
Void());
state Future<Void> updater = timeout(
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers), 60.0, Void());
wait(client && updater);
client.cancel();
updater.cancel();
ASSERT_EQ(globalTagThrottler.tagsTracked(), 1);
globalTagThrottler.removeExpiredTags();
ASSERT_EQ(globalTagThrottler.tagsTracked(), 1);
wait(delay(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER + 1.0));
ASSERT_EQ(globalTagThrottler.tagsTracked(), 1);
globalTagThrottler.removeExpiredTags();
ASSERT_EQ(globalTagThrottler.tagsTracked(), 0);
return Void();
}
// Test that the number of tags tracked does not grow beyond SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED
TEST_CASE("/GlobalTagThrottler/TagLimit") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 5);
std::vector<Future<Void>> futures;
for (int i = 0; i < 2 * SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED; ++i) {
Arena arena;
TransactionTag tag = makeString(8, arena);
deterministicRandom()->randomBytes(mutateString(tag), tag.size());
futures.push_back(GlobalTagThrottlerTesting::runClient(
&globalTagThrottler, &storageServers, tag, 1.0, 6.0, GlobalTagThrottlerTesting::OpType::READ));
}
wait(timeout(waitForAll(futures), 60.0, Void()));
ASSERT_EQ(globalTagThrottler.tagsTracked(), SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED);
return Void();
}

View File

@ -62,6 +62,7 @@ struct GrvProxyStats {
LatencySample defaultTxnGRVTimeInQueue;
LatencySample batchTxnGRVTimeInQueue;
// These latency bands and samples ignore latency injected by the GrvProxyTransactionTagThrottler
LatencyBands grvLatencyBands;
LatencySample grvLatencySample; // GRV latency metric sample of default priority
LatencySample grvBatchLatencySample; // GRV latency metric sample of batched priority
@ -692,7 +693,7 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture,
double end = g_network->timer();
for (GetReadVersionRequest const& request : requests) {
double duration = end - request.requestTime();
double duration = end - request.requestTime() - request.proxyTagThrottledDuration;
if (request.priority == TransactionPriority::BATCH) {
stats->grvBatchLatencySample.addMeasurement(duration);
}

View File

@ -53,7 +53,11 @@ struct KeyValueStoreCompressTestData final : IKeyValueStore {
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override {
store->set(KeyValueRef(keyValue.key, pack(keyValue.value)), arena);
}
void clear(KeyRangeRef range, const Arena* arena = nullptr) override { store->clear(range, arena); }
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override {
store->clear(range, storageMetrics, arena);
}
Future<Void> commit(bool sequential = false) override { return store->commit(sequential); }
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override {

View File

@ -130,7 +130,7 @@ public:
}
}
void clear(KeyRangeRef range, const Arena* arena) override {
void clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) override {
// A commit that occurs with no available space returns Never, so we can throw out all modifications
if (getAvailableSize() <= 0)
return;

View File

@ -1846,22 +1846,52 @@ struct RocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
writeBatch->Put(defaultFdbCF, toSlice(kv.key), toSlice(kv.value));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE) {
keysSet.insert(kv.key);
}
}
void clear(KeyRangeRef keyRange, const Arena*) override {
void clear(KeyRangeRef keyRange, const StorageServerMetrics* storageMetrics, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
if (keyRange.singleKeyRange()) {
writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin));
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr &&
storageMetrics->byteSample.getEstimate(keyRange) <
SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) {
rocksdb::ReadOptions options = sharedState->getReadOptions();
auto beginSlice = toSlice(keyRange.begin);
auto endSlice = toSlice(keyRange.end);
options.iterate_lower_bound = &beginSlice;
options.iterate_upper_bound = &endSlice;
auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options, defaultFdbCF));
cursor->Seek(toSlice(keyRange.begin));
while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) {
writeBatch->Delete(defaultFdbCF, cursor->key());
cursor->Next();
}
if (!cursor->status().ok()) {
// If read-range iteration fails, fall back to a DeleteRange.
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
} else {
auto it = keysSet.lower_bound(keyRange.begin);
while (it != keysSet.end() && *it < keyRange.end) {
writeBatch->Delete(defaultFdbCF, toSlice(*it));
it++;
}
}
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
}
}
}
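
The branch above trades a bounded iterator scan plus point deletes against a single range tombstone, since accumulating RocksDB range tombstones can degrade later read performance. A self-contained sketch of the same decision against the stock RocksDB API, where singleKeyDeleteLimit stands in for the byte-sample estimate compared to ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT (illustrative helper, not the code above):

#include <memory>
#include <rocksdb/db.h>
#include <rocksdb/write_batch.h>

// Clears [begin, end): queues per-key deletes when the range is believed small,
// otherwise a single range tombstone. Falls back to DeleteRange on iterator error.
void clearRange(rocksdb::DB* db,
                const rocksdb::Slice& begin,
                const rocksdb::Slice& end,
                uint64_t estimatedBytes,
                uint64_t singleKeyDeleteLimit,
                rocksdb::WriteBatch* batch) {
    if (estimatedBytes < singleKeyDeleteLimit) {
        rocksdb::ReadOptions options;
        options.iterate_lower_bound = &begin; // bound the scan to the cleared range
        options.iterate_upper_bound = &end;
        std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options));
        for (it->Seek(begin); it->Valid(); it->Next()) {
            batch->Delete(it->key());
        }
        if (it->status().ok()) {
            return;
        }
        // Iteration failed; the queued point deletes are harmless, and the
        // range tombstone below covers the whole range anyway.
    }
    batch->DeleteRange(begin, end);
}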
@ -1890,6 +1920,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
auto a = new Writer::CommitAction();
a->batchToCommit = std::move(writeBatch);
keysSet.clear();
auto res = a->done.getFuture();
writeThread->post(a);
return res;
@ -2083,6 +2114,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Promise<Void> closePromise;
Future<Void> openFuture;
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::set<Key> keysSet;
Optional<Future<Void>> metrics;
FlowLock readSemaphore;
int numReadWaiters;

View File

@ -1596,7 +1596,9 @@ public:
StorageBytes getStorageBytes() const override;
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override;
Future<Void> commit(bool sequential = false) override;
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> optionss) override;
@ -2215,7 +2217,7 @@ void KeyValueStoreSQLite::set(KeyValueRef keyValue, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::SetAction(keyValue));
}
void KeyValueStoreSQLite::clear(KeyRangeRef range, const Arena* arena) {
void KeyValueStoreSQLite::clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::ClearAction(range));
}

View File

@ -49,6 +49,7 @@ static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 :
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
const std::string rocksDataFolderSuffix = "-data";
const std::string METADATA_SHARD_ID = "kvs-metadata";
const KeyRef shardMappingPrefix("\xff\xff/ShardMapping/"_sr);
// TODO: move constants to a header file.
const StringRef ROCKSDBSTORAGE_HISTOGRAM_GROUP = "RocksDBStorage"_sr;
@ -304,13 +305,12 @@ rocksdb::ReadOptions getReadOptions() {
}
struct ReadIterator {
rocksdb::ColumnFamilyHandle* cf;
uint64_t index; // incrementing counter to uniquely identify read iterator.
bool inUse;
std::shared_ptr<rocksdb::Iterator> iter;
double creationTime;
ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, rocksdb::ReadOptions& options)
: cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
: index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
};
/*
@ -475,13 +475,26 @@ struct PhysicalShard {
}
~PhysicalShard() {
if (!deletePending)
return;
logShardEvent(id, ShardOp::CLOSE);
isInitialized.store(false);
readIterPool.reset();
// Destroy CF
auto s = db->DropColumnFamily(cf);
// Deleting default column family is not allowed.
if (id == "default") {
return;
}
if (deletePending) {
auto s = db->DropColumnFamily(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
}
auto s = db->DestroyColumnFamilyHandle(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logRocksDBError(s, "DestroyCFHandle");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
@ -628,7 +641,7 @@ public:
std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
bool foundMetadata = false;
for (const auto& name : columnFamilies) {
if (name == "kvs-metadata") {
if (name == METADATA_SHARD_ID) {
foundMetadata = true;
}
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions });
@ -652,19 +665,19 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
.detail("PhysicalShardCount", handles.size());
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
for (auto handle : handles) {
if (handle->GetName() == "kvs-metadata") {
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
} else {
physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
auto shard = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
if (shard->id == METADATA_SHARD_ID) {
metadataShard = shard;
}
physicalShards[shard->id] = shard;
columnFamilyMap[handle->GetID()] = handle;
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
.detail("PhysicalShard", handle->GetName());
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId).detail("PhysicalShard", shard->id);
}
std::set<std::string> unusedShards(columnFamilies.begin(), columnFamilies.end());
unusedShards.erase("kvs-metadata");
unusedShards.erase(METADATA_SHARD_ID);
unusedShards.erase("default");
KeyRange keyRange = prefixRange(shardMappingPrefix);
@ -746,9 +759,11 @@ public:
defaultShard->dataShards[specialKeys.begin.toString()] = std::move(dataShard);
physicalShards[defaultShard->id] = defaultShard;
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata");
// Create metadata shard.
auto metadataShard = std::make_shared<PhysicalShard>(db, METADATA_SHARD_ID);
metadataShard->init();
columnFamilyMap[metadataShard->cf->GetID()] = metadataShard->cf;
physicalShards[METADATA_SHARD_ID] = metadataShard;
// Write special key range metadata.
writeBatch = std::make_unique<rocksdb::WriteBatch>();
@ -763,7 +778,6 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
writeBatch = std::make_unique<rocksdb::WriteBatch>();
dirtyShards = std::make_unique<std::set<PhysicalShard*>>();
@ -910,6 +924,9 @@ public:
std::vector<std::shared_ptr<PhysicalShard>> getPendingDeletionShards(double cleanUpDelay) {
std::vector<std::shared_ptr<PhysicalShard>> emptyShards;
double currentTime = now();
TraceEvent(SevInfo, "ShardedRocksDB", logId)
.detail("PendingDeletionShardQueueSize", pendingDeletionShards.size());
while (!pendingDeletionShards.empty()) {
const auto& id = pendingDeletionShards.front();
auto it = physicalShards.find(id);
@ -976,6 +993,10 @@ public:
.detail("Info", "RangeToPersist")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
auto it = physicalShards.find(METADATA_SHARD_ID);
ASSERT(it != physicalShards.end());
auto metadataShard = it->second;
writeBatch->DeleteRange(metadataShard->cf,
getShardMappingKey(range.begin, shardMappingPrefix),
getShardMappingKey(range.end, shardMappingPrefix));
@ -1043,24 +1064,30 @@ public:
}
void closeAllShards() {
for (auto& [_, shard] : physicalShards) {
shard->readIterPool.reset();
}
columnFamilyMap.clear();
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBClosed");
}
void destroyAllShards() {
closeAllShards();
std::vector<rocksdb::ColumnFamilyDescriptor> cfs;
for (const auto& [key, _] : physicalShards) {
cfs.push_back(rocksdb::ColumnFamilyDescriptor{ key, getCFOptions() });
columnFamilyMap.clear();
for (auto& [_, shard] : physicalShards) {
shard->deletePending = true;
}
auto s = rocksdb::DestroyDB(path, getOptions(), cfs);
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
s = rocksdb::DestroyDB(path, getOptions());
if (!s.ok()) {
logRocksDBError(s, "DestroyDB");
}
@ -1121,7 +1148,6 @@ private:
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::unique_ptr<std::set<PhysicalShard*>> dirtyShards;
KeyRangeMap<DataShard*> dataShardMap;
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
std::deque<std::string> pendingDeletionShards;
};
@ -2240,6 +2266,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
// TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are
// occurring.
if (g_network->isSimulated()) {
TraceEvent(SevDebug, "ShardedRocksDB").detail("Info", "Use Coro threads in simulation.");
writeThread = CoroThreadPool::createThreadPool();
readThreads = CoroThreadPool::createThreadPool();
} else {
@ -2316,7 +2343,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override { shardManager.put(kv.key, kv.value); }
void clear(KeyRangeRef range, const Arena*) override {
void clear(KeyRangeRef range, const StorageServerMetrics*, const Arena*) override {
if (range.singleKeyRange()) {
shardManager.clear(range.begin);
} else {

View File

@ -31,6 +31,106 @@ bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus s
return true;
}
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
auto ranges = serverKeys.intersectingRanges(range);
ASSERT(!ranges.empty());
if (ranges.begin().range().contains(range)) {
CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
return;
}
if (ranges.begin().begin() < range.begin) {
CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
}
if (ranges.end().end() > range.end) {
CODE_PROBE(true, "Implicitly split end range to 2 pieces");
twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
}
ranges = serverKeys.containedRanges(range);
// now the boundary must be aligned
ASSERT(ranges.begin().begin() == range.begin);
ASSERT(ranges.end().end() == range.end);
uint64_t newSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
newSize += it->cvalue().shardSize;
}
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
auto oldStatus = it.value().status;
if (isStatusTransitionValid(oldStatus, status)) {
it.value() = ShardInfo{ status, newSize };
} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
CODE_PROBE(true, "Shard already on server");
} else {
TraceEvent(SevError, "MockShardStatusTransitionError")
.detail("From", oldStatus)
.detail("To", status)
.detail("ID", id)
.detail("KeyBegin", range.begin.toHexString())
.detail("KeyEnd", range.begin.toHexString());
}
}
serverKeys.coalesce(range);
}
// Split the outer range [a, d) based on the inner range's boundary [b, c). The result would be [a, b), [b, c),
// [c, d). The sizes of the new shards are randomly split from the old size of [a, d).
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
uint64_t outerRangeSize,
bool restrictSize) {
ASSERT(outerRange.contains(innerRange));
Key left = outerRange.begin;
// Randomly generate 3 shard sizes; the caller guarantees that the min/max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int midSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? outerRangeSize - leftSize - midSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
serverKeys[left].shardSize = leftSize;
serverKeys[innerRange.end].shardSize = rightSize;
}
// Split the range [a, c) at split point b. The result would be [a, b), [b, c). The
// sizes of the new shards are randomly split from the old size of [a, c).
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
KeyRef splitPoint,
uint64_t rangeSize,
bool restrictSize) {
Key left = range.begin;
// Randomly generate 2 shard sizes; the caller guarantees that the min/max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
: SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? rangeSize - leftSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
serverKeys[left].shardSize = leftSize;
}
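
The size arithmetic in both helpers reduces to picking a random cut that leaves every piece at least MIN_SHARD_BYTES. A self-contained sketch of the two-way case under restrictSize (minShardBytes is illustrative for SERVER_KNOBS->MIN_SHARD_BYTES; the three-way split applies the same bound twice):

#include <cassert>
#include <cstdint>
#include <random>
#include <utility>

// Split totalBytes into two sizes that each respect minShardBytes and together
// sum to the original, mirroring the restrictSize == true branch above.
std::pair<uint64_t, uint64_t> twoWaySplitSizes(uint64_t totalBytes,
                                               uint64_t minShardBytes,
                                               std::mt19937_64& rng) {
    assert(totalBytes >= 2 * minShardBytes); // caller guarantees a valid split
    // leftSize is drawn from [minShardBytes, totalBytes - minShardBytes].
    std::uniform_int_distribution<uint64_t> dist(minShardBytes, totalBytes - minShardBytes);
    uint64_t leftSize = dist(rng);
    return { leftSize, totalBytes - leftSize };
}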
void MockStorageServer::removeShard(KeyRangeRef range) {
auto ranges = serverKeys.containedRanges(range);
ASSERT(ranges.begin().range() == range);
serverKeys.rawErase(range);
}
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
auto ranges = serverKeys.intersectingRanges(range);
uint64_t totalSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
totalSize += it->cvalue().shardSize;
}
return totalSize;
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
@ -104,8 +204,78 @@ TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
auto id = MockGlobalState::indexToUID(i);
std::cout << "Check server " << i << "\n";
ASSERT(mgs->serverIsSourceForShard(id, allKeys));
ASSERT(mgs->allServers.at(id).serverKeys.sumRange(allKeys.begin, allKeys.end) == 0);
ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
}
return Void();
}
struct MockGlobalStateTester {
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
void testThreeWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.ranges().begin();
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
MockShardStatus oldStatus = it.cvalue().status;
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
Key x2 = keyAfter(x1);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
ranges.pop_front();
ASSERT(ranges.empty());
}
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
void testTwoWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.nthRange(0);
MockShardStatus oldStatus = it.cvalue().status;
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.empty());
}
};
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
MockGlobalStateTester tester;
auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
std::cout << "Test 3-way splitting...\n";
tester.testThreeWaySplitFirstRange(mss);
std::cout << "Test 2-way splitting...\n";
mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
tester.testTwoWaySplitFirstRange(mss);
return Void();
}

View File

@ -2469,7 +2469,7 @@ ACTOR Future<Void> cleanUpDataMove(Database occ,
return Void();
}
Future<Void> startMovement(Database occ, MoveKeysParams& params, std::map<UID, StorageServerInterface>& tssMapping) {
Future<Void> rawStartMovement(Database occ, MoveKeysParams& params, std::map<UID, StorageServerInterface>& tssMapping) {
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
return startMoveShards(std::move(occ),
params.dataMoveId,
@ -2491,9 +2491,9 @@ Future<Void> startMovement(Database occ, MoveKeysParams& params, std::map<UID, S
params.ddEnabledState);
}
Future<Void> finishMovement(Database occ,
MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
Future<Void> rawFinishMovement(Database occ,
MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
return finishMoveShards(std::move(occ),
params.dataMoveId,
@ -2523,7 +2523,7 @@ ACTOR Future<Void> moveKeys(Database occ, MoveKeysParams params) {
state std::map<UID, StorageServerInterface> tssMapping;
wait(startMovement(occ, params, tssMapping));
wait(rawStartMovement(occ, params, tssMapping));
state Future<Void> completionSignaller = checkFetchingState(occ,
params.healthyDestinations,
@ -2532,7 +2532,7 @@ ACTOR Future<Void> moveKeys(Database occ, MoveKeysParams params) {
params.relocationIntervalId,
tssMapping);
wait(finishMovement(occ, params, tssMapping));
wait(rawFinishMovement(occ, params, tssMapping));
// This is defensive, but make sure that we always say that the movement is complete before moveKeys completes
completionSignaller.cancel();

Some files were not shown because too many files have changed in this diff.