Merge commit '0ae568a872e474c8c755e648efbbe4524e63e445' into storageserver-pml

# Conflicts:
#	fdbserver/VersionedBTree.actor.cpp
Steve Atherton 2022-10-24 22:31:36 -07:00
commit 27dc180b68
134 changed files with 2325 additions and 1009 deletions


@@ -274,93 +274,21 @@ if(NOT WIN32)
@CLUSTER_FILE@
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
)
add_fdbclient_test(
NAME fdb_c_api_tests
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--knob
delete-native-lib-after-loading=false # for properly symbolizing xSAN errors
)
add_fdbclient_test(
NAME fdb_c_api_tests_local_only
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/local_tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--knob
delete-native-lib-after-loading=false # for properly symbolizing xSAN errors
)
add_fdbclient_test(
NAME fdb_c_api_tests_blob_granule
DISABLE_LOG_DUMP
API_TEST_BLOB_GRANULES_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests
--blob-granule-local-file-path
@DATA_DIR@/fdbblob/
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--knob
delete-native-lib-after-loading=false # for properly symbolizing xSAN errors
)
add_fdbclient_test(
NAME fdb_c_api_tests_with_tls
DISABLE_LOG_DUMP
TLS_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--tls-cert-file
@CLIENT_CERT_FILE@
--tls-key-file
@CLIENT_KEY_FILE@
--tls-ca-file
@SERVER_CA_FILE@
--knob
delete-native-lib-after-loading=false # for properly symbolizing xSAN errors
)
file(GLOB API_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/test/apitester/tests/*.toml")
foreach(test_file ${API_TEST_FILES})
get_filename_component(file_name "${test_file}" NAME_WE)
set(test_name "fdb_c_api_test_${file_name}")
add_test(NAME "${test_name}"
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--api-tester-bin $<TARGET_FILE:fdb_c_api_tester>
--external-client-library ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-file ${test_file}
--knob delete-native-lib-after-loading=false
)
set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300)
endforeach()
add_test(NAME fdb_c_upgrade_to_future_version
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py


@@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -279,9 +279,9 @@ bool parseArgs(TesterOptions& options, int argc, char** argv) {
return true;
}
void fdb_check(fdb::Error e) {
if (e) {
fmt::print(stderr, "Unexpected FDB error: {}({})\n", e.code(), e.what());
void fdb_check(fdb::Error e, std::string_view msg, fdb::Error::CodeType expectedError = error_code_success) {
if (e.code()) {
fmt::print(stderr, "{}, Error: {}({})\n", msg, e.code(), e.what());
std::abort();
}
}
@@ -453,13 +453,13 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
if (!runWorkloads(options)) {
retCode = 1;
}
fdb_check(fdb::network::stop());
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join();
} catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());


@@ -1,29 +0,0 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -29,31 +29,39 @@ from pathlib import Path
import glob
import random
import string
import toml
sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "tests", "TestRunner")]
# fmt: off
from tmp_cluster import TempCluster
from local_cluster import TLSConfig
# fmt: on
TESTER_STATS_INTERVAL_SEC = 5
def random_string(len):
return ''.join(random.choice(string.ascii_letters + string.digits) for i in range(len))
return "".join(random.choice(string.ascii_letters + string.digits) for i in range(len))
def get_logger():
return logging.getLogger('foundationdb.run_c_api_tests')
return logging.getLogger("foundationdb.run_c_api_tests")
def initialize_logger_level(logging_level):
logger = get_logger()
assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']
assert logging_level in ["DEBUG", "INFO", "WARNING", "ERROR"]
logging.basicConfig(format='%(message)s')
if logging_level == 'DEBUG':
logging.basicConfig(format="%(message)s")
if logging_level == "DEBUG":
logger.setLevel(logging.DEBUG)
elif logging_level == 'INFO':
elif logging_level == "INFO":
logger.setLevel(logging.INFO)
elif logging_level == 'WARNING':
elif logging_level == "WARNING":
logger.setLevel(logging.WARNING)
elif logging_level == 'ERROR':
elif logging_level == "ERROR":
logger.setLevel(logging.ERROR)
@@ -65,39 +73,52 @@ def dump_client_logs(log_dir):
print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file))
def run_tester(args, test_file):
cmd = [args.tester_binary,
"--cluster-file", args.cluster_file,
"--test-file", test_file,
"--stats-interval", str(TESTER_STATS_INTERVAL_SEC*1000)]
def run_tester(args, cluster, test_file):
build_dir = Path(args.build_dir).resolve()
tester_binary = Path(args.api_tester_bin).resolve()
external_client_library = build_dir.joinpath("bindings", "c", "libfdb_c_external.so")
log_dir = Path(cluster.log).joinpath("client")
log_dir.mkdir(exist_ok=True)
cmd = [
tester_binary,
"--cluster-file",
cluster.cluster_file,
"--test-file",
test_file,
"--stats-interval",
str(TESTER_STATS_INTERVAL_SEC * 1000),
"--tmp-dir",
cluster.tmp_dir,
"--log",
"--log-dir",
str(log_dir),
]
if args.external_client_library is not None:
cmd += ["--external-client-library", args.external_client_library]
if args.tmp_dir is not None:
cmd += ["--tmp-dir", args.tmp_dir]
log_dir = None
if args.log_dir is not None:
log_dir = Path(args.log_dir).joinpath(random_string(8))
log_dir.mkdir(exist_ok=True)
cmd += ['--log', "--log-dir", str(log_dir)]
external_client_library = Path(args.external_client_library).resolve()
cmd += ["--external-client-library", external_client_library]
if args.blob_granule_local_file_path is not None:
cmd += ["--blob-granule-local-file-path",
args.blob_granule_local_file_path]
if cluster.blob_granules_enabled:
cmd += [
"--blob-granule-local-file-path",
str(cluster.data.joinpath("fdbblob")) + os.sep,
]
if args.tls_ca_file is not None:
cmd += ["--tls-ca-file", args.tls_ca_file]
if args.tls_key_file is not None:
cmd += ["--tls-key-file", args.tls_key_file]
if args.tls_cert_file is not None:
cmd += ["--tls-cert-file", args.tls_cert_file]
if cluster.tls_config is not None:
cmd += [
"--tls-ca-file",
cluster.server_ca_file,
"--tls-key-file",
cluster.client_key_file,
"--tls-cert-file",
cluster.client_cert_file,
]
for knob in args.knobs:
knob_name, knob_value = knob.split("=")
cmd += ["--knob-" + knob_name, knob_value]
get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd))
get_logger().info("\nRunning tester '%s'..." % " ".join(map(str, cmd)))
proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
timed_out = False
ret_code = 1
@@ -107,34 +128,76 @@ def run_tester(args, test_file):
proc.kill()
timed_out = True
except Exception as e:
raise Exception('Unable to run tester (%s)' % e)
raise Exception("Unable to run tester (%s)" % e)
if ret_code != 0:
if timed_out:
reason = 'timed out after %d seconds' % args.timeout
reason = "timed out after %d seconds" % args.timeout
elif ret_code < 0:
reason = signal.Signals(-ret_code).name
else:
reason = 'exit code: %d' % ret_code
get_logger().error('\n\'%s\' did not complete successfully (%s)' %
(cmd[0], reason))
if (log_dir is not None):
reason = "exit code: %d" % ret_code
get_logger().error("\n'%s' did not complete succesfully (%s)" % (cmd[0], reason))
if log_dir is not None:
dump_client_logs(log_dir)
get_logger().info('')
get_logger().info("")
return ret_code
class TestConfig:
def __init__(self, test_file):
config = toml.load(test_file)
server_config = config.get("server", [{}])[0]
self.tenants_enabled = server_config.get("tenants_enabled", True)
self.blob_granules_enabled = server_config.get("blob_granules_enabled", False)
self.tls_enabled = server_config.get("tls_enabled", False)
self.client_chain_len = server_config.get("tls_client_chain_len", 2)
self.server_chain_len = server_config.get("tls_server_chain_len", 3)
self.min_num_processes = server_config.get("min_num_processes", 1)
self.max_num_processes = server_config.get("max_num_processes", 3)
self.num_processes = random.randint(self.min_num_processes, self.max_num_processes)
def run_test(args, test_file):
config = TestConfig(test_file)
tls_config = None
if config.tls_enabled:
tls_config = TLSConfig(
server_chain_len=config.server_chain_len,
client_chain_len=config.client_chain_len,
)
with TempCluster(
args.build_dir,
config.num_processes,
enable_tenants=config.tenants_enabled,
blob_granules_enabled=config.blob_granules_enabled,
tls_config=tls_config,
) as cluster:
ret_code = run_tester(args, cluster, test_file)
if not cluster.check_cluster_logs():
ret_code = 1 if ret_code == 0 else ret_code
return ret_code
def run_tests(args):
num_failed = 0
test_files = [f for f in os.listdir(args.test_dir) if os.path.isfile(
os.path.join(args.test_dir, f)) and f.endswith(".toml")]
if args.test_file is not None:
test_files = [Path(args.test_file).resolve()]
else:
test_files = [
f
for f in os.listdir(args.test_dir)
if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml")
]
for test_file in test_files:
get_logger().info('=========================================================')
get_logger().info('Running test %s' % test_file)
get_logger().info('=========================================================')
ret_code = run_tester(args, os.path.join(args.test_dir, test_file))
get_logger().info("=========================================================")
get_logger().info("Running test %s" % test_file)
get_logger().info("=========================================================")
ret_code = run_test(args, os.path.join(args.test_dir, test_file))
if ret_code != 0:
num_failed += 1
@@ -142,34 +205,49 @@ def run_tests(args):
def parse_args(argv):
parser = argparse.ArgumentParser(description='FoundationDB C API Tester')
parser.add_argument('--cluster-file', type=str, default="fdb.cluster",
help='The cluster file for the cluster being connected to. (default: fdb.cluster)')
parser.add_argument('--tester-binary', type=str, default="fdb_c_api_tester",
help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)')
parser.add_argument('--external-client-library', type=str, default=None,
help='Path to the external client library. (default: None)')
parser.add_argument('--test-dir', type=str, default="./",
help='Path to a directory with test definitions. (default: ./)')
parser.add_argument('--timeout', type=int, default=300,
help='The timeout in seconds for running each individual test. (default 300)')
parser.add_argument('--log-dir', type=str, default=None,
help='The directory for storing logs (default: None)')
parser.add_argument('--logging-level', type=str, default='INFO',
choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').')
parser.add_argument('--tmp-dir', type=str, default=None,
help='The directory for storing temporary files (default: None)')
parser.add_argument('--blob-granule-local-file-path', type=str, default=None,
help='Enable blob granule tests if set, value is path to local blob granule files')
parser.add_argument('--tls-ca-file', type=str, default=None,
help='Path to client\'s TLS CA file: i.e. certificate of CA that signed the server certificate')
parser.add_argument('--tls-cert-file', type=str, default=None,
help='Path to client\'s TLS certificate file')
parser.add_argument('--tls-key-file', type=str, default=None,
help='Path to client\'s TLS private key file')
parser.add_argument('--knob', type=str, default=[], action="append", dest="knobs",
help='[lowercase-knob-name]=[knob-value] (there may be multiple --knob options)')
parser = argparse.ArgumentParser(description="FoundationDB C API Tester")
parser.add_argument("--build-dir", "-b", type=str, required=True, help="FDB build directory")
parser.add_argument("--api-tester-bin", type=str, help="Path to the fdb_c_api_tester executable.", required=True)
parser.add_argument("--external-client-library", type=str, help="Path to the external client library.")
parser.add_argument(
"--cluster-file",
type=str,
default="fdb.cluster",
help="The cluster file for the cluster being connected to. (default: fdb.cluster)",
)
parser.add_argument(
"--test-dir",
type=str,
default="./",
help="Path to a directory with test definitions. (default: ./)",
)
parser.add_argument(
"--test-file",
type=str,
default=None,
help="Path to a single test definition to be executed, overrides --test-dir if set.",
)
parser.add_argument(
"--timeout",
type=int,
default=300,
help="The timeout in seconds for running each individual test. (default 300)",
)
parser.add_argument(
"--logging-level",
type=str,
default="INFO",
choices=["ERROR", "WARNING", "INFO", "DEBUG"],
help="Specifies the level of detail in the tester output (default='INFO').",
)
parser.add_argument(
"--knob",
type=str,
default=[],
action="append",
dest="knobs",
help="[lowercase-knob-name]=[knob-value] (there may be multiple --knob options)",
)
return parser.parse_args(argv)
@@ -180,5 +258,5 @@ def main(argv):
return run_tests(args)
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))


@@ -12,13 +12,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -11,13 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -11,13 +11,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -0,0 +1,28 @@
[[test]]
title = 'Cancel Transaction with Database per Transaction with TLS'
multiThreaded = true
buggify = true
databasePerTransaction = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -11,15 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000


@@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -0,0 +1,29 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -11,23 +11,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -4,23 +4,23 @@ minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -0,0 +1,37 @@
[[test]]
title = 'API Correctness with TLS'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -11,23 +11,22 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000


@@ -9,13 +9,13 @@ maxClients = 8
minTenants = 2
maxTenants = 5
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9


@@ -0,0 +1,25 @@
[[test]]
title = 'Multi-tenant API Correctness Multi Threaded'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minClients = 2
maxClients = 8
minTenants = 2
maxTenants = 5
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9


@@ -12,13 +12,13 @@ maxClientThreads = 4
minClients = 2
maxClients = 4
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -0,0 +1,28 @@
[[test]]
title = 'Test tampering the cluster file with TLS'
multiThreaded = true
buggify = true
tamperClusterFile = true
minFdbThreads = 2
maxFdbThreads = 4
minDatabases = 2
maxDatabases = 4
minClientThreads = 2
maxClientThreads = 4
minClients = 2
maxClients = 4
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -46,7 +46,7 @@ int main(int argc, char** argv) {
}
fdb_check(fdb_select_api_version(FDB_API_VERSION));
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
fdb_check(
fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));


@@ -321,7 +321,16 @@ int populate(Database db,
const auto key_begin = insertBegin(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
const auto key_end = insertEnd(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
auto key_checkpoint = key_begin; // in case of commit failure, restart from this key
double required_keys = (key_end - key_begin + 1) * args.load_factor;
for (auto i = key_begin; i <= key_end; i++) {
// Choose required_keys out of the remaining (key_end - i + 1) keys, i.e. select key i with
// probability required_keys / (key_end - i + 1): generate a random number in [0, 1) and keep
// the key if that number is at most the probability.
double r = rand() / (1.0 + RAND_MAX);
if (r > required_keys / (key_end - i + 1)) {
continue;
}
--required_keys;
/* sequential keys */
genKey(keystr.data(), KEY_PREFIX, args, i);
/* random values */
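
For context, the loop above is an instance of sequential selection sampling (Knuth's Algorithm S): scanning the keys in order and keeping each one with probability required_keys / remaining_keys yields an ordered, roughly uniform sample of about load_factor of the range. A minimal standalone sketch of the same scheme (the names and values here are illustrative, not mako's):

.. code-block:: cpp

    #include <cstdio>
    #include <cstdlib>

    int main() {
        const int key_begin = 0, key_end = 99999;
        const double load_factor = 0.25; // illustrative value for -l/--load_factor
        double required = (key_end - key_begin + 1) * load_factor;
        long chosen = 0;
        for (int i = key_begin; i <= key_end; i++) {
            double r = rand() / (1.0 + RAND_MAX); // uniform in [0, 1)
            if (r > required / (key_end - i + 1)) {
                continue; // skip this key
            }
            --required;
            ++chosen; // here mako would generate and insert key i
        }
        printf("chose %ld of %d keys\n", chosen, key_end - key_begin + 1);
        return 0;
    }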
@@ -984,6 +993,7 @@ int initArguments(Arguments& args) {
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
@@ -1166,6 +1176,7 @@ void usage() {
printf("%-24s %s\n", "-t, --threads=THREADS", "Specify number of worker threads");
printf("%-24s %s\n", " --async_xacts", "Specify number of concurrent transactions to be run in async mode");
printf("%-24s %s\n", "-r, --rows=ROWS", "Specify number of records");
printf("%-24s %s\n", "-l, --load_factor=LOAD_FACTOR", "Specify load factor");
printf("%-24s %s\n", "-s, --seconds=SECONDS", "Specify the test duration in seconds\n");
printf("%-24s %s\n", "", "This option cannot be specified with --iteration.");
printf("%-24s %s\n", "-i, --iteration=ITERS", "Specify the number of iterations.\n");
@@ -1228,6 +1239,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "threads", required_argument, NULL, 't' },
{ "async_xacts", required_argument, NULL, ARG_ASYNC },
{ "rows", required_argument, NULL, 'r' },
{ "load_factor", required_argument, NULL, 'l' },
{ "seconds", required_argument, NULL, 's' },
{ "iteration", required_argument, NULL, 'i' },
{ "keylen", required_argument, NULL, ARG_KEYLEN },
@@ -1304,6 +1316,9 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.rows = atoi(optarg);
args.row_digits = digits(args.rows);
break;
case 'l':
args.load_factor = atof(optarg);
break;
case 's':
args.seconds = atoi(optarg);
break;
@@ -1523,6 +1538,10 @@ int validateArguments(Arguments const& args) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
@@ -2118,6 +2137,7 @@ int statsProcessMain(Arguments const& args,
fmt::fprintf(fp, "\"async_xacts\": %d,", args.async_xacts);
fmt::fprintf(fp, "\"mode\": %d,", args.mode);
fmt::fprintf(fp, "\"rows\": %d,", args.rows);
fmt::fprintf(fp, "\"load_factor\": %lf,", args.load_factor);
fmt::fprintf(fp, "\"seconds\": %d,", args.seconds);
fmt::fprintf(fp, "\"iteration\": %d,", args.iteration);
fmt::fprintf(fp, "\"tpsmax\": %d,", args.tpsmax);


@@ -138,6 +138,7 @@ struct Arguments {
int async_xacts;
int mode;
int rows; /* is 2 billion enough? */
double load_factor;
int row_digits;
int seconds;
int iteration;


@@ -233,7 +233,7 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
// Try calling some basic functionality that is available
// in all recent API versions


@@ -271,7 +271,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
timeoutDb = fdb_open_database(argv[1]);


@@ -66,7 +66,7 @@ TEST_CASE("setup") {
},
&context));
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
CHECK(!context.called);
fdb_check(fdb_stop_network());


@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
set_net_opt(FDBNetworkOption::FDB_NET_OPTION_TRACE_PARTIAL_FILE_SUFFIX, trace_partial_file_suffix);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
// Apparently you need to open a database to initialize logging
FDBDatabase* out;


@@ -2998,7 +2998,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
clusterFilePath = std::string(argv[1]);


@@ -88,7 +88,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
{
FDBCluster* cluster;


@@ -392,11 +392,6 @@ func (o DatabaseOptions) SetTransactionIncludePortInAddress() error {
return o.setOpt(505, nil)
}
// Set a random idempotency id for all transactions. See the transaction option description for more information.
func (o DatabaseOptions) SetTransactionAutomaticIdempotency() error {
return o.setOpt(506, nil)
}
// Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information.
func (o DatabaseOptions) SetTransactionBypassUnreadable() error {
return o.setOpt(700, nil)
@@ -556,18 +551,6 @@ func (o TransactionOptions) SetSizeLimit(param int64) error {
return o.setOpt(503, int64ToBytes(param))
}
// Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes.
//
// Parameter: Unique ID
func (o TransactionOptions) SetIdempotencyId(param string) error {
return o.setOpt(504, []byte(param))
}
// Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future.
func (o TransactionOptions) SetAutomaticIdempotency() error {
return o.setOpt(505, nil)
}
// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior.
func (o TransactionOptions) SetSnapshotRywEnable() error {
return o.setOpt(600, nil)


@@ -320,11 +320,11 @@ function(create_long_running_correctness_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${package_files}
${out_dir}/joshua_test


@@ -0,0 +1,9 @@
#!/bin/sh
# Simulation currently has memory leaks. We need to investigate before we can enable leak detection in joshua.
export ASAN_OPTIONS="detect_leaks=0"
OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}"
#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false
python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --long-running


@@ -0,0 +1,3 @@
#!/bin/bash -u
python3 -m test_harness.timeout --long-running


@@ -184,6 +184,8 @@ class Config:
self.reproduce_prefix: str | None = None
self.reproduce_prefix_args = {'type': str, 'required': False,
'help': 'When printing the results, prepend this string to the command'}
self.long_running: bool = False
self.long_running_args = {'action': 'store_true'}
self._env_names: Dict[str, str] = {}
self._config_map = self._build_map()
self._read_env()

View File

@@ -303,6 +303,7 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@@ -375,7 +376,7 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)


@@ -384,6 +384,7 @@ class Summary:
child.attributes['Severity'] = '40'
child.attributes['ErrorCount'] = str(self.errors)
self.out.append(child)
self.error = True
if self.was_killed:
child = SummaryTree('ExternalTimeout')
child.attributes['Severity'] = '40'
@@ -420,6 +421,7 @@ class Summary:
child = SummaryTree('TestUnexpectedlyNotFinished')
child.attributes['Severity'] = '40'
self.out.append(child)
self.error = True
if self.error_out is not None and len(self.error_out) > 0:
lines = self.error_out.splitlines()
stderr_bytes = 0


@@ -524,6 +524,12 @@ The ``start`` command will start a new restore on the specified (or default) tag
``--inconsistent-snapshot-only``
Ignore mutation log files during the restore to speed up the process. Because only range files are restored, this option gives an inconsistent snapshot in most cases and is not recommended.
``--user-data``
Restore only the user keyspace. This option should NOT be used alongside --system-metadata (below) and CANNOT be used alongside other specified key ranges.
``--system-metadata``
Restore only the relevant system keyspace. This option should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.
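
For example, a divided restore could run the two options as separate jobs; this sketch assumes the usual ``-r`` source-URL and ``--dest-cluster-file`` options, with placeholder values:

.. code-block:: bash

    fdbrestore start -r <backup_url> --dest-cluster-file fdb.cluster --user-data -w
    fdbrestore start -r <backup_url> --dest-cluster-file fdb.cluster --system-metadata -w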
.. program:: fdbrestore abort
``abort``


@@ -648,6 +648,16 @@ The subclasses of the ``ApiWorkload`` inherit the following configuration option
initiated by a test script to check if the client workload is successfully progressing after a
cluster change.
The FDB server configuration can be specialized in the section ``[[server]]`` (an example follows the list):
- ``tenants_enabled``: enable multitenancy (default: true)
- ``blob_granules_enabled``: enable support for blob granules (default: false)
- ``tls_enabled``: enable TLS (default: false)
- ``tls_client_chain_len``: the length of the client-side TLS chain (default: 2)
- ``tls_server_chain_len``: the length of the server-side TLS chain (default: 3)
- ``min_num_processes`` and ``max_num_processes``: the number of FDB server processes to be
randomly selected from the given range (default 1-3)
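
For instance, a test that needs TLS on a single-process cluster with the default chain lengths could declare (a sketch assembled from the options above):

.. code-block:: toml

    [[server]]
    tls_enabled = true
    tls_client_chain_len = 2
    tls_server_chain_len = 3
    min_num_processes = 1
    max_num_processes = 1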
Executing the Tests
===================
@@ -656,19 +666,35 @@ according to its specification. Before that we must create an FDB cluster and pass its cluster file as
a parameter to ``fdb_c_api_tester``. Note that multithreaded tests also need to be provided with an
external client library.
For example, we can create a temporary cluster and use it for execution of one of the existing API tests:
The ``run_c_api_tests.py`` script automates execution of the API tests on a local cluster. The cluster
is created according to the options specified in the ``[[server]]`` section of the given test file.
.. code-block:: bash
${srcDir}/tests/TestRunner/tmp_cluster.py --build-dir ${buildDir} -- \
${buildDir}/bin/fdb_c_api_tester \
--cluster-file @CLUSTER_FILE@ \
--external-client-library=${buildDir}/bindings/c/libfdb_c_external.so \
${srcDir}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${buildDir}
--api-tester-bin ${buildDir}/bin/fdb_c_api_tester
--external-client-library ${buildDir}/bindings/c/libfdb_c_external.so
--test-file ${srcDir}/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
The test specifications added to the ``bindings/c/test/apitester/tests/`` directory are executed as a part
of the regression test suite. They can be executed using the ``ctest`` target ``fdb_c_api_tests``:
of the regression test suite as ``ctest`` targets with names ``fdb_c_api_test_{file_name}``.
The ``ctest`` targets provide a more convenient way to execute the API tests. We can execute
a single test:
.. code-block:: bash
ctest -R fdb_c_api_tests -VV
ctest -R fdb_c_api_test_CApiCorrectnessMultiThr -VV
or execute all of them in parallel (here ``-j20`` specifies the parallelization level):
.. code-block:: bash
ctest -R fdb_c_api_test_ -j20 --output-on-failure
More sophisticated filters can be applied to execute a selected set of tests, e.g. the tests using TLS:
.. code-block:: bash
ctest -R 'fdb_c_api_test_.*TLS' -j20 --output-on-failure


@@ -47,6 +47,7 @@
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/S3BlobStore.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/json_spirit/json_spirit_writer_template.h"
#include "flow/Platform.h"
@@ -155,6 +156,11 @@ enum {
OPT_RESTORE_CLUSTERFILE_ORIG,
OPT_RESTORE_BEGIN_VERSION,
OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY,
// The two restore options below allow callers of fdbrestore to divide a normal restore into one which restores just
// the system keyspace and another that restores just the user keyspace. This is unlike the backup command, where
// all keys (both system and user) are backed up together.
OPT_RESTORE_USER_DATA,
OPT_RESTORE_SYSTEM_DATA,
// Shared constants
OPT_CLUSTERFILE,
@@ -696,6 +702,8 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = {
{ OPT_BACKUPKEYS, "--keys", SO_REQ_SEP },
{ OPT_WAITFORDONE, "-w", SO_NONE },
{ OPT_WAITFORDONE, "--waitfordone", SO_NONE },
{ OPT_RESTORE_USER_DATA, "--user-data", SO_NONE },
{ OPT_RESTORE_SYSTEM_DATA, "--system-metadata", SO_NONE },
{ OPT_RESTORE_VERSION, "--version", SO_REQ_SEP },
{ OPT_RESTORE_VERSION, "-v", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
@@ -1187,6 +1195,13 @@ static void printRestoreUsage(bool devhelp) {
printf(" The cluster file for the original database from which the backup was created. The "
"original database\n");
printf(" is only needed to convert a --timestamp argument to a database version.\n");
printf(" --user-data\n"
" Restore only the user keyspace. This option should NOT be used alongside "
"--system-metadata (below) and CANNOT be used alongside other specified key ranges.\n");
printf(
" --system-metadata\n"
" Restore only the relevant system keyspace. This option "
"should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.\n");
if (devhelp) {
#ifdef _WIN32
@@ -3367,6 +3382,8 @@ int main(int argc, char* argv[]) {
bool trace = false;
bool quietDisplay = false;
bool dryRun = false;
bool restoreSystemKeys = false;
bool restoreUserKeys = false;
// TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start
bool encryptionEnabled = true;
std::string traceDir = "";
@@ -3691,6 +3708,14 @@ int main(int argc, char* argv[]) {
restoreVersion = ver;
break;
}
case OPT_RESTORE_USER_DATA: {
restoreUserKeys = true;
break;
}
case OPT_RESTORE_SYSTEM_DATA: {
restoreSystemKeys = true;
break;
}
case OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY: {
inconsistentSnapshotOnly.set(true);
break;
@@ -3838,6 +3863,11 @@ int main(int argc, char* argv[]) {
}
}
if (restoreSystemKeys && restoreUserKeys) {
fprintf(stderr, "ERROR: Please only specify one of --user-data or --system-metadata, not both\n");
return FDB_EXIT_ERROR;
}
if (trace) {
if (!traceLogGroup.empty())
setNetworkOption(FDBNetworkOptions::TRACE_LOG_GROUP, StringRef(traceLogGroup));
@@ -3938,10 +3968,30 @@ int main(int argc, char* argv[]) {
// The fastrestore tool does not yet support multiple ranges and is incompatible with tenants
// or other features that back up data in the system keys
if (backupKeys.empty() && programExe != ProgramExe::FASTRESTORE_TOOL) {
if (!restoreSystemKeys && !restoreUserKeys && backupKeys.empty() &&
programExe != ProgramExe::FASTRESTORE_TOOL) {
addDefaultBackupRanges(backupKeys);
}
if ((restoreSystemKeys || restoreUserKeys) && programExe == ProgramExe::FASTRESTORE_TOOL) {
fprintf(stderr, "ERROR: Options: --user-data and --system-metadata are not supported with fastrestore\n");
return FDB_EXIT_ERROR;
}
if ((restoreUserKeys || restoreSystemKeys) && !backupKeys.empty()) {
fprintf(stderr,
"ERROR: Cannot specify additional ranges when using --user-data or --system-metadata "
"options\n");
return FDB_EXIT_ERROR;
}
if (restoreUserKeys) {
backupKeys.push_back_deep(backupKeys.arena(), normalKeys);
} else if (restoreSystemKeys) {
for (const auto& r : getSystemBackupRanges()) {
backupKeys.push_back_deep(backupKeys.arena(), r);
}
}
switch (programExe) {
case ProgramExe::AGENT:
if (!initCluster())


@@ -93,8 +93,12 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
} else if (limitType == LimitType::RESERVED) {
quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();
}
ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota);
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully updated quota.\n");
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
@@ -109,6 +113,7 @@ ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
try {
tr->clear(ThrottleApi::getTagQuotaKey(tag));
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully cleared quota.\n");
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
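
As an aside, the quota conversion in ``setQuota`` above uses the standard integer ceiling-division idiom: for positive integers, ``(value - 1) / factor + 1`` equals ``ceil(value / factor)``, so any nonzero byte value maps to at least one cost unit. A quick self-contained check (the factor 16384 is only an assumed illustration, not necessarily the actual ``READ_COST_BYTE_FACTOR``):

.. code-block:: cpp

    #include <cassert>

    // ceil(value / factor) for positive integers, matching the idiom above
    long ceilDiv(long value, long factor) {
        return (value - 1) / factor + 1;
    }

    int main() {
        const long factor = 16384; // assumed illustrative READ_COST_BYTE_FACTOR
        assert(ceilDiv(1, factor) == 1);     // any nonzero value rounds up to 1
        assert(ceilDiv(16384, factor) == 1); // exact multiples are unchanged
        assert(ceilDiv(16385, factor) == 2); // one byte over rounds up
        return 0;
    }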


@@ -1480,6 +1480,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (isCommitDesc && tokens.size() == 1) {
// prompt for description and add to txn
state Optional<std::string> raw;
warn.cancel();
while (!raw.present() || raw.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty.\n");
@@ -1490,6 +1491,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
std::string line = raw.get();
config_tr->set("\xff\xff/description"_sr, line);
}
warn =
checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
if (transtype == TransType::Db) {
wait(commitTransaction(tr));
} else {
@@ -1821,6 +1824,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (!intrans) {
// prompt for description and add to txn
state Optional<std::string> raw_desc;
warn.cancel();
while (!raw_desc.present() || raw_desc.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty\n");
@ -1830,6 +1834,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
}
std::string line = raw_desc.get();
config_tr->set("\xff\xff/description"_sr, line);
warn = checkStatus(
timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
wait(commitTransaction(config_tr));
} else {
isCommitDesc = true;


@@ -109,7 +109,7 @@ def quota(logger):
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully cleared quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
@@ -120,17 +120,17 @@ def quota(logger):
command = 'quota set red total_throughput 49152'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully updated quota.'
command = 'quota set green total_throughput 32768'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully updated quota.'
command = 'quota set green reserved_throughput 16384'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully updated quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
@@ -145,7 +145,7 @@ def quota(logger):
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully cleared quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)


@@ -63,7 +63,7 @@ public:
m_buffer = Standalone<VectorRef<uint8_t>>(old.slice(size, old.size()));
// Write the old buffer to the underlying file and update the write offset
Future<Void> r = holdWhile(old, m_file->write(old.begin(), size, m_writeOffset));
Future<Void> r = uncancellable(holdWhile(old, m_file->write(old.begin(), size, m_writeOffset)));
m_writeOffset += size;
return r;


@@ -1057,6 +1057,9 @@ ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beg
beginVersion <= delta.clearVersion.get();
if (delta.values.empty()) {
return ParsedDeltaBoundaryRef(delta.key, clearAfter);
} else if (readVersion >= delta.values.back().version && beginVersion <= delta.values.back().version) {
// For all but at most one delta file, readVersion is >= every version in the file, so the last value is the visible one; optimize that common case.
return ParsedDeltaBoundaryRef(delta.key, clearAfter, delta.values.back());
}
auto valueAtVersion = std::lower_bound(delta.values.begin(),
delta.values.end(),
@@ -1338,6 +1341,10 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
std::set<int16_t, std::greater<int16_t>> activeClears;
int16_t maxActiveClear = -1;
// trade off memory for cpu performance by assuming all inserts
RangeResult result;
int maxExpectedSize = 0;
// check if a given stream is actively clearing
bool clearActive[streams.size()];
for (int16_t i = 0; i < streams.size(); i++) {
@@ -1355,14 +1362,12 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
item.streamIdx = i;
item.dataIdx = 0;
next.push(item);
maxExpectedSize += streams[i].size();
result.arena().dependsOn(streams[i].arena());
}
}
result.reserve(result.arena(), maxExpectedSize);
if (chunk.snapshotFile.present()) {
stats.snapshotRows += streams[0].size();
}
RangeResult result;
std::vector<MergeStreamNext> cur;
cur.reserve(streams.size());
while (!next.empty()) {
@@ -1397,7 +1402,7 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
if (v.isSet() && maxActiveClear < it.streamIdx) {
KeyRef finalKey =
chunk.tenantPrefix.present() ? v.key.removePrefix(chunk.tenantPrefix.get()) : v.key;
result.push_back_deep(result.arena(), KeyValueRef(finalKey, v.value));
result.push_back(result.arena(), KeyValueRef(finalKey, v.value));
if (!includesSnapshot) {
stats.rowsInserted++;
} else if (it.streamIdx > 0) {
@@ -1426,11 +1431,39 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
}
}
// FIXME: if memory assumption was wrong and result is significantly smaller than total input size, could copy it
// with push_back_deep to a new result. This is rare though
stats.outputBytes += result.expectedSize();
return result;
}
RangeResult materializeJustSnapshot(const BlobGranuleChunkRef& chunk,
Optional<StringRef> snapshotData,
const KeyRange& requestRange,
GranuleMaterializeStats& stats) {
stats.inputBytes += snapshotData.get().size();
Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows = loadSnapshotFile(
chunk.snapshotFile.get().filename, snapshotData.get(), requestRange, chunk.snapshotFile.get().cipherKeysCtx);
RangeResult result;
if (!snapshotRows.empty()) {
result.arena().dependsOn(snapshotRows.arena());
result.reserve(result.arena(), snapshotRows.size());
for (auto& it : snapshotRows) {
// TODO REMOVE validation
ASSERT(it.op == MutationRef::Type::SetValue);
KeyRef finalKey = chunk.tenantPrefix.present() ? it.key.removePrefix(chunk.tenantPrefix.get()) : it.key;
result.push_back(result.arena(), KeyValueRef(finalKey, it.value));
}
stats.outputBytes += result.expectedSize();
stats.snapshotRows += result.size();
}
return result;
}
RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
KeyRangeRef keyRange,
Version beginVersion,
@@ -1454,6 +1487,11 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
requestRange = keyRange;
}
// fast case for only-snapshot read
if (chunk.snapshotFile.present() && chunk.deltaFiles.empty() && chunk.newDeltas.empty()) {
return materializeJustSnapshot(chunk, snapshotData, requestRange, stats);
}
std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>> streams;
std::vector<bool> startClears;
// +1 for possible snapshot, +1 for possible memory deltas
@ -1471,7 +1509,10 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
streams.push_back(snapshotRows);
startClears.push_back(false);
arena.dependsOn(streams.back().arena());
stats.snapshotRows += snapshotRows.size();
}
} else {
ASSERT(!chunk.snapshotFile.present());
}
if (BG_READ_DEBUG) {
@@ -2675,6 +2716,14 @@ struct CommonPrefixStats {
int totalKeys = 0;
int minKeySize = 1000000000;
int maxKeySize = 0;
int64_t logicalBytes = 0;
int64_t totalLogicalBytes = 0;
int deltas = 0;
int deltasSet = 0;
int deltasClear = 0;
int deltasNoOp = 0;
int deltasClearAfter = 0;
void addKey(const KeyRef& k) {
if (len == -1) {
@@ -2689,7 +2738,38 @@ struct CommonPrefixStats {
maxKeySize = std::max(maxKeySize, k.size());
}
void addKeyValue(const KeyRef& k, const ValueRef& v) {
addKey(k);
logicalBytes += k.size();
logicalBytes += v.size();
}
void addBoundary(const ParsedDeltaBoundaryRef& d) {
addKey(d.key);
deltas++;
if (d.isSet()) {
deltasSet++;
logicalBytes += d.value.size();
} else if (d.isClear()) {
deltasClear++;
} else {
ASSERT(d.isNoOp());
deltasNoOp++;
}
if (d.clearAfter) {
deltasClearAfter++;
}
}
void doneFile() {
totalLogicalBytes += logicalBytes;
fmt::print("Logical Size: {0}\n", logicalBytes);
logicalBytes = 0;
}
Key done() {
doneFile();
ASSERT(len >= 0);
fmt::print("Common prefix: {0}\nCommon Prefix Length: {1}\nAverage Key Size: {2}\nMin Key Size: {3}, Max Key "
"Size: {4}\n",
@ -2698,11 +2778,21 @@ struct CommonPrefixStats {
totalKeySize / totalKeys,
minKeySize,
maxKeySize);
if (deltas > 0) {
fmt::print("Delta stats: {0} deltas, {1} sets, {2} clears, {3} noops, {4} clearAfters\n",
deltas,
deltasSet,
deltasClear,
deltasNoOp,
deltasClearAfter);
}
fmt::print("Logical Size: {0}\n", totalLogicalBytes);
return key.substr(0, len);
}
};
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames) {
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames, bool newFormat) {
FileSet files;
CommonPrefixStats stats;
for (int i = 0; i < filenames.size(); i++) {
@ -2713,40 +2803,66 @@ FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filena
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
Arena arena;
GranuleSnapshot file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena);
Standalone<GranuleSnapshot> parsed(file, arena);
Standalone<GranuleSnapshot> parsed;
if (!newFormat) {
Arena arena;
GranuleSnapshot file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena);
parsed = Standalone<GranuleSnapshot>(file, arena);
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
for (auto& it : parsed) {
stats.addKeyValue(it.key, it.value);
}
} else {
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res = loadSnapshotFile(""_sr, data, normalKeys, {});
fmt::print("Loaded {0} rows from snapshot file\n", res.size());
for (auto& it : res) {
stats.addKeyValue(it.key, it.value);
}
}
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
files.snapshotFile = { filenames[i], version, data, parsed };
for (auto& it : parsed) {
stats.addKey(it.key);
}
} else {
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
Arena arena;
GranuleDeltas file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena);
Standalone<GranuleDeltas> parsed(file, arena);
if (!newFormat) {
Arena arena;
GranuleDeltas file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena);
Standalone<GranuleDeltas> parsed(file, arena);
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
}
}
}
} else {
bool startClear = false;
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res =
loadChunkedDeltaFile(""_sr, data, normalKeys, 0, version, {}, startClear);
ASSERT(!startClear);
Standalone<GranuleDeltas> parsed;
fmt::print("Loaded {0} boundaries from delta file\n", res.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : res) {
stats.addBoundary(it);
}
}
}
stats.doneFile();
}
files.commonPrefix = stats.done();
@ -2804,6 +2920,28 @@ std::pair<int64_t, double> doDeltaWriteBench(const Standalone<GranuleDeltas>& da
return { serializedBytes, elapsed };
}
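// Builds a BlobGranuleChunkRef over the first numDeltaFiles delta files of a file set,
// letting read benchmarks vary how many delta layers sit on top of the snapshot.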
void chunkFromFileSet(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef>& chunk,
StringRef* deltaPtrs,
Version readVersion,
Optional<BlobGranuleCipherKeysCtx> keys,
int numDeltaFiles) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < numDeltaFiles; i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
FileSet rewriteChunkedFileSet(const FileSet& fileSet,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
@ -2830,41 +2968,30 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
KeyRange readRange,
bool clearAllAtEnd,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
int numDeltaFiles,
bool printStats = false) {
Version readVersion = std::get<1>(fileSet.deltaFiles.back());
Standalone<BlobGranuleChunkRef> chunk;
GranuleMaterializeStats stats;
StringRef deltaPtrs[fileSet.deltaFiles.size()];
ASSERT(numDeltaFiles >= 0 && numDeltaFiles <= fileSet.deltaFiles.size());
StringRef deltaPtrs[numDeltaFiles];
MutationRef clearAllAtEndMutation;
if (clearAllAtEnd) {
clearAllAtEndMutation = MutationRef(MutationRef::Type::ClearRange, readRange.begin, readRange.end);
}
if (chunked) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < fileSet.deltaFiles.size(); i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunkFromFileSet(fileSet, chunk, deltaPtrs, readVersion, keys, numDeltaFiles);
if (clearAllAtEnd) {
readVersion++;
MutationsAndVersionRef lastDelta;
lastDelta.version = readVersion;
lastDelta.mutations.push_back(chunk.arena(), clearAllAtEndMutation);
chunk.includedVersion = readVersion;
chunk.newDeltas.push_back_deep(chunk.arena(), lastDelta);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
int64_t serializedBytes = 0;
@ -2897,15 +3024,16 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
elapsed /= READ_RUNS;
serializedBytes /= READ_RUNS;
// TODO REMOVE
fmt::print("Materialize stats:\n");
fmt::print(" Input bytes: {0}\n", stats.inputBytes);
fmt::print(" Output bytes: {0}\n", stats.outputBytes);
fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes);
fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows);
fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared);
fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted);
fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated);
if (printStats) {
fmt::print("Materialize stats:\n");
fmt::print(" Input bytes: {0}\n", stats.inputBytes / READ_RUNS);
fmt::print(" Output bytes: {0}\n", stats.outputBytes / READ_RUNS);
fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes);
fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows / READ_RUNS);
fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared / READ_RUNS);
fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted / READ_RUNS);
fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated / READ_RUNS);
}
return { serializedBytes, elapsed };
}
@ -2937,7 +3065,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
int64_t logicalSnapshotSize = 0;
int64_t logicalDeltaSize = 0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it);
FileSet fileSet = loadFileSet(basePath, it, false);
fileSets.push_back(fileSet);
logicalSnapshotSize += std::get<3>(fileSet.snapshotFile).expectedSize();
for (auto& deltaFile : fileSet.deltaFiles) {
@ -2968,7 +3096,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3024,9 +3152,16 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
std::vector<std::string> readRunNames = {};
std::vector<std::pair<int64_t, double>> readMetrics;
bool doEdgeCaseReadTests = true;
bool doEdgeCaseReadTests = false;
bool doVaryingDeltaTests = false;
std::vector<double> clearAllReadMetrics;
std::vector<double> readSingleKeyMetrics;
std::vector<std::vector<std::pair<int64_t, double>>> varyingDeltaMetrics;
size_t maxDeltaFiles = 100000;
for (auto& f : fileSets) {
maxDeltaFiles = std::min(maxDeltaFiles, f.deltaFiles.size());
}
for (bool chunk : chunkModes) {
for (bool encrypt : encryptionModes) {
@ -3049,7 +3184,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3062,6 +3197,10 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
double totalElapsed = 0.0;
double totalElapsedClearAll = 0.0;
double totalElapsedSingleKey = 0.0;
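// One (bytes, seconds) accumulator per delta-file count, so read throughput can be
// compared as delta files are layered onto the snapshot.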
std::vector<std::pair<int64_t, double>> varyingDeltas;
for (int i = 0; i <= maxDeltaFiles; i++) {
varyingDeltas.push_back({ 0, 0.0 });
}
for (auto& fileSet : fileSets) {
FileSet newFileSet;
if (!chunk) {
@ -3070,24 +3209,38 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
newFileSet = rewriteChunkedFileSet(fileSet, keys, compressionFilter);
}
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, compressionFilter);
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, newFileSet.deltaFiles.size());
totalBytesRead += res.first;
totalElapsed += res.second;
if (doEdgeCaseReadTests) {
totalElapsedClearAll +=
doReadBench(newFileSet, chunk, fileSet.range, true, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, fileSet.range, true, keys, newFileSet.deltaFiles.size())
.second;
Key k = std::get<3>(fileSet.snapshotFile).front().key;
KeyRange singleKeyRange(KeyRangeRef(k, keyAfter(k)));
totalElapsedSingleKey +=
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, newFileSet.deltaFiles.size())
.second;
}
if (doVaryingDeltaTests && chunk) {
for (int i = 0; i <= maxDeltaFiles; i++) {
auto r = doReadBench(newFileSet, chunk, fileSet.range, false, keys, i);
varyingDeltas[i].first += r.first;
varyingDeltas[i].second += r.second;
}
}
}
readMetrics.push_back({ totalBytesRead, totalElapsed });
if (doEdgeCaseReadTests) {
clearAllReadMetrics.push_back(totalElapsedClearAll);
readSingleKeyMetrics.push_back(totalElapsedSingleKey);
}
if (doVaryingDeltaTests) {
varyingDeltaMetrics.push_back(varyingDeltas);
}
}
}
}
@ -3121,6 +3274,25 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
}
}
if (doVaryingDeltaTests) {
ASSERT(readRunNames.size() == varyingDeltaMetrics.size());
fmt::print("\n\nVarying Deltas Read Results:\nDF#\t");
for (int i = 0; i <= maxDeltaFiles; i++) {
fmt::print("{0}\t", i);
}
fmt::print("\n");
for (int i = 0; i < readRunNames.size(); i++) {
fmt::print("{0}", readRunNames[i]);
for (auto& it : varyingDeltaMetrics[i]) {
double MBperCPUsec = (it.first / 1024.0 / 1024.0) / it.second;
fmt::print("\t{:.6}", MBperCPUsec);
}
fmt::print("\n");
}
}
fmt::print("\n\nCombined Results:\n");
ASSERT(readRunNames.size() == runNames.size() - 1);
for (int i = 0; i < readRunNames.size(); i++) {
@ -3137,3 +3309,22 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
return Void();
}
TEST_CASE("!/blobgranule/files/repeatFromFiles") {
std::string basePath = "SET_ME";
std::vector<std::vector<std::string>> fileSetNames = { { "SET_ME" } };
int64_t totalBytesRead = 0;
double totalElapsed = 0.0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it, true);
auto res = doReadBench(fileSet, true, fileSet.range, false, {}, fileSet.deltaFiles.size(), true);
totalBytesRead += res.first;
totalElapsed += res.second;
}
double MBperCPUsec = (totalBytesRead / 1024.0 / 1024.0) / totalElapsed;
fmt::print("Read Results: {:.6} MB/cpusec\n", MBperCPUsec);
return Void();
}
View File
@ -142,7 +142,6 @@ bool isRangeFullyCovered(KeyRange range, Standalone<VectorRef<BlobGranuleChunkRe
for (const BlobGranuleChunkRef& chunk : blobChunks) {
blobRanges.push_back(chunk.keyRange);
}
return range.isCovered(blobRanges);
}
@ -194,7 +193,7 @@ TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") {
testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks);
testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks);
testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks);
ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false);
ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks));
}
return Void();
}
View File
@ -1040,13 +1040,10 @@ private:
Key lastValue;
};
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Database> cx) {
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
// Read begin key, if this fails then block was invalid.
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
// Read kv pairs and end key
@ -1075,7 +1072,6 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1083,7 +1079,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
int len,
Optional<Database> cx) {
state Standalone<StringRef> buf = makeString(len);
int rLen = wait(file->read(mutateString(buf), len, offset));
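// holdWhile keeps `buf` referenced until the read future resolves, and uncancellable
// ensures an in-flight read into that buffer is not abandoned if the calling actor is cancelled.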
int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset))));
if (rLen != len)
throw restore_bad_read();
@ -1098,7 +1094,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
wait(decodeKVPairs(&reader, &results, false, cx));
decodeKVPairs(&reader, &results);
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1121,7 +1117,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
wait(decodeKVPairs(&reader, &results, true, cx));
decodeKVPairs(&reader, &results);
} else {
throw restore_unsupported_file_version();
}
View File
@ -2559,19 +2559,19 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
}
}
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) {
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantName);
tr.set(key, BinaryWriter::toValue<uint64_t>(quota, Unversioned()));
tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned()));
}
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName)));
if (!v.present()) {
return Optional<uint64_t>();
return Optional<int64_t>();
}
return BinaryReader::fromStringRef<uint64_t>(v.get(), Unversioned());
return BinaryReader::fromStringRef<int64_t>(v.get(), Unversioned());
}
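// A minimal usage sketch (hypothetical actor, not part of this change). The switch to int64_t
// matches the BinaryWriter encoding above and leaves room for a signed sentinel value, which
// uint64_t could not represent.
ACTOR Future<Void> setQuotaExample(Database db, TenantName tenant) {
	state Transaction tr(db);
	loop {
		try {
			setStorageQuota(tr, tenant, 5LL << 30); // 5 GiB quota
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}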
std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) {
View File
@ -297,7 +297,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false );
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -421,6 +422,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experimental purposes; never enable it in production.
// If enabled, all committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable the ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These knobs have opposing functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
// Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable
@ -788,7 +793,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );
View File
@ -62,8 +62,8 @@ struct BlobMetadataDetailsRef {
BlobMetadataDomainNameRef domainName,
Optional<StringRef> base,
VectorRef<StringRef> partitions,
int64_t refreshAt,
int64_t expireAt)
double refreshAt,
double expireAt)
: domainId(domainId), domainName(ar, domainName), partitions(ar, partitions), refreshAt(refreshAt),
expireAt(expireAt) {
if (base.present()) {
View File
@ -336,12 +336,13 @@ struct KeyRangeRef {
bool isCovered(std::vector<KeyRangeRef>& ranges) {
ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder()));
KeyRangeRef clone(begin, end);
for (auto r : ranges) {
if (begin < r.begin)
if (clone.begin < r.begin)
return false; // uncovered gap between clone.begin and r.begin
if (end <= r.end)
if (clone.end <= r.end)
return true; // range is fully covered
if (end > r.begin)
if (clone.end > r.begin)
// {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end}
clone = KeyRangeRef(r.end, clone.end);
}
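// Worked example (sketch): for sorted ranges ["a","c") and ["c","f"), a query of ["a","e")
// is covered: the first pass trims the clone to ["c","e"), the second finds clone.end <= r.end
// and returns true. A query of ["a","g") exhausts the list and falls through to false after the loop.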
@ -1402,6 +1403,25 @@ struct TenantMode {
serializer(ar, mode);
}
// This does not go back-and-forth cleanly with toString
// The '_experimental' suffix, if present, needs to be removed in order to be parsed.
static TenantMode fromString(std::string mode) {
if (mode.find("_experimental") != std::string::npos) {
mode.replace(mode.find("_experimental"), std::string::npos, "");
}
if (mode == "disabled") {
return TenantMode::DISABLED;
} else if (mode == "optional") {
return TenantMode::OPTIONAL_TENANT;
} else if (mode == "required") {
return TenantMode::REQUIRED;
} else {
TraceEvent(SevError, "UnknownTenantMode").detail("TenantMode", mode);
ASSERT(false);
throw internal_error();
}
}
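// Round-trip caveat sketch (assuming TenantMode equality compares the underlying mode):
// fromString strips the "_experimental" suffix that toString may append, e.g.
//   ASSERT(TenantMode::fromString("optional_experimental") == TenantMode::OPTIONAL_TENANT);
//   ASSERT(TenantMode::fromString("disabled") == TenantMode::DISABLED);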
std::string toString() const {
switch (mode) {
case DISABLED:
@ -1669,8 +1689,8 @@ struct Versionstamp {
template <class Ar>
void serialize(Ar& ar) {
uint64_t beVersion;
uint16_t beBatch;
int64_t beVersion;
int16_t beBatch;
if constexpr (!Ar::isDeserializing) {
beVersion = bigEndian64(version);
@ -1680,7 +1700,7 @@ struct Versionstamp {
serializer(ar, beVersion, beBatch);
if constexpr (Ar::isDeserializing) {
version = bigEndian64(version);
version = bigEndian64(beVersion);
batchNumber = bigEndian16(beBatch);
}
}
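// The fix above byte-swaps the received wire value (beVersion) rather than the still-uninitialized
// member; since the byte swap is its own inverse, bigEndian64(bigEndian64(v)) == v holds for the
// serialize/deserialize pair.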
View File
@ -104,6 +104,11 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getL
// Collect cached cipher keys.
for (auto& domain : domains) {
if (domain.first == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (domain.first == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
Reference<BlobCipherKey> cachedCipherKey = cipherKeyCache->getLatestCipherKey(domain.first /*domainId*/);
if (cachedCipherKey.isValid()) {
cipherKeys[domain.first] = cachedCipherKey;
@ -301,7 +306,7 @@ template <class T>
Future<TextAndHeaderCipherKeys> getLatestSystemEncryptCipherKeys(const Reference<AsyncVar<T> const>& db,
BlobCipherMetrics::UsageType usageType) {
return getLatestEncryptCipherKeysForDomain(
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME, usageType);
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME, usageType);
}
ACTOR template <class T>
View File
@ -164,8 +164,8 @@ bool schemaMatch(json_spirit::mValue const& schema,
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Set and get the storage quota per tenant
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota);
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
#include "flow/unactorcompiler.h"
#endif
View File
@ -237,8 +237,10 @@ public:
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED;
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant in the TenantCache is
// refreshed
int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed
// in the TenantCache
int TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL; // How often the storage quota allocated to each tenant is
// refreshed in the TenantCache
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -345,6 +347,8 @@ public:
int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD;
int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD;
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;
View File
@ -535,28 +535,33 @@ public:
}
void put(const TenantNameEntryPair& pair) {
TenantEntryCachePayload<T> payload = createPayloadFunc(pair.first, pair.second);
auto idItr = mapByTenantId.find(pair.second.id);
auto nameItr = mapByTenantName.find(pair.first);
const auto& [name, entry] = pair;
TenantEntryCachePayload<T> payload = createPayloadFunc(name, entry);
auto idItr = mapByTenantId.find(entry.id);
auto nameItr = mapByTenantName.find(name);
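// The cache is indexed by both id and name; capture any stale counterpart entries first so
// both maps can be purged and remain consistent before the new payload is inserted.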
Optional<TenantName> existingName;
Optional<int64_t> existingId;
if (nameItr != mapByTenantName.end()) {
existingId = nameItr->value.entry.id;
mapByTenantId.erase(nameItr->value.entry.id);
}
if (idItr != mapByTenantId.end()) {
existingName = idItr->value.name;
mapByTenantName.erase(idItr->value.name);
}
if (existingId.present()) {
mapByTenantId.erase(existingId.get());
}
if (existingName.present()) {
mapByTenantName.erase(existingName.get());
}
mapByTenantId[pair.second.id] = payload;
mapByTenantName[pair.first] = payload;
mapByTenantId[entry.id] = payload;
mapByTenantName[name] = payload;
TraceEvent("TenantEntryCachePut")
.detail("TenantName", pair.first)
.detail("TenantName", name)
.detail("TenantNameExisting", existingName)
.detail("TenantID", pair.second.id)
.detail("TenantID", entry.id)
.detail("TenantIDExisting", existingId)
.detail("TenantPrefix", pair.second.prefix);
@ -582,4 +587,4 @@ public:
};
#include "flow/unactorcompiler.h"
#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H
#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H
View File
@ -202,8 +202,9 @@ description is not currently required but encouraged.
description="Deprecated. Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect."
defaultFor="23"/>
<Option name="transaction_automatic_idempotency" code="506"
description="Set a random idempotency id for all transactions. See the transaction option description for more information."
defaultFor="505"/>
description="Set a random idempotency id for all transactions. See the transaction option description for more information. This feature is in development and not ready for general use."
defaultFor="505"
hidden="true"/>
<Option name="transaction_bypass_unreadable" code="700"
description="Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information."
defaultFor="1100"/>
@ -278,9 +279,11 @@ description is not currently required but encouraged.
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes." />
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use."
hidden="true" />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future." />
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use."
hidden="true" />
<Option name="snapshot_ryw_enable" code="600"
description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." />
<Option name="snapshot_ryw_disable" code="601"
View File
@ -48,15 +48,17 @@ public:
ACTOR static Future<Standalone<StringRef>> readBlock(AsyncFileEncrypted* self, uint32_t block) {
state Arena arena;
state unsigned char* encrypted = new (arena) unsigned char[FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE];
int bytes = wait(
self->file->read(encrypted, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block));
int bytes = wait(uncancellable(holdWhile(arena,
self->file->read(encrypted,
FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE,
FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block))));
StreamCipherKey const* cipherKey = StreamCipherKey::getGlobalCipherKey();
DecryptionStreamCipher decryptor(cipherKey, self->getIV(block));
auto decrypted = decryptor.decrypt(encrypted, bytes, arena);
return Standalone<StringRef>(decrypted, arena);
}
ACTOR static Future<int> read(AsyncFileEncrypted* self, void* data, int length, int64_t offset) {
ACTOR static Future<int> read(Reference<AsyncFileEncrypted> self, void* data, int length, int64_t offset) {
state const uint32_t firstBlock = offset / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE;
state const uint32_t lastBlock = (offset + length - 1) / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE;
state uint32_t block;
@ -70,7 +72,7 @@ public:
if (cachedBlock.present()) {
plaintext = cachedBlock.get();
} else {
wait(store(plaintext, readBlock(self, block)));
wait(store(plaintext, readBlock(self.getPtr(), block)));
self->readBuffers.insert(block, plaintext);
}
auto start = (block == firstBlock) ? plaintext.begin() + (offset % FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE)
@ -96,7 +98,7 @@ public:
return bytesRead;
}
ACTOR static Future<Void> write(AsyncFileEncrypted* self, void const* data, int length, int64_t offset) {
ACTOR static Future<Void> write(Reference<AsyncFileEncrypted> self, void const* data, int length, int64_t offset) {
ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY);
// All writes must append to the end of the file:
ASSERT_EQ(offset, self->currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE + self->offsetInBlock);
@ -122,7 +124,7 @@ public:
return Void();
}
ACTOR static Future<Void> sync(AsyncFileEncrypted* self) {
ACTOR static Future<Void> sync(Reference<AsyncFileEncrypted> self) {
ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY);
wait(self->writeLastBlockToFile());
wait(self->file->sync());
@ -135,7 +137,7 @@ public:
Arena arena;
auto zeroes = new (arena) unsigned char[length];
memset(zeroes, 0, length);
wait(self->write(zeroes, length, offset));
wait(uncancellable(holdWhile(arena, self->write(zeroes, length, offset))));
return Void();
}
};
@ -159,11 +161,11 @@ void AsyncFileEncrypted::delref() {
}
Future<int> AsyncFileEncrypted::read(void* data, int length, int64_t offset) {
return AsyncFileEncryptedImpl::read(this, data, length, offset);
return AsyncFileEncryptedImpl::read(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset);
}
Future<Void> AsyncFileEncrypted::write(void const* data, int length, int64_t offset) {
return AsyncFileEncryptedImpl::write(this, data, length, offset);
return AsyncFileEncryptedImpl::write(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset);
}
Future<Void> AsyncFileEncrypted::zeroRange(int64_t offset, int64_t length) {
@ -177,7 +179,7 @@ Future<Void> AsyncFileEncrypted::truncate(int64_t size) {
Future<Void> AsyncFileEncrypted::sync() {
ASSERT(mode == Mode::APPEND_ONLY);
return AsyncFileEncryptedImpl::sync(this);
return AsyncFileEncryptedImpl::sync(Reference<AsyncFileEncrypted>::addRef(this));
}
Future<Void> AsyncFileEncrypted::flush() {
@ -217,7 +219,11 @@ StreamCipher::IV AsyncFileEncrypted::getIV(uint32_t block) const {
}
Future<Void> AsyncFileEncrypted::writeLastBlockToFile() {
return file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE);
// The source buffer for the write is owned by *this so this must be kept alive by reference count until the write
// is finished.
return uncancellable(
holdWhile(Reference<AsyncFileEncrypted>::addRef(this),
file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE)));
}
size_t AsyncFileEncrypted::RandomCache::evict() {
View File
@ -71,8 +71,9 @@ public:
// Wait for diskDelay before submitting the I/O
// Template types are being provided explicitly because they can't be automatically deduced for some reason.
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<int>(Void)>, int>(
delay(diskDelay), [=](Void _) -> Future<int> { return file->read(data, length, offset); });
delay(diskDelay), [=, file = file](Void _) -> Future<int> { return file->read(data, length, offset); });
}
Future<Void> write(void const* data, int length, int64_t offset) override {
@ -111,12 +112,14 @@ public:
}
// Wait for diskDelay before submitting the I/O
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(delay(diskDelay), [=](Void _) -> Future<Void> {
if (pdata)
return holdWhile(arena, file->write(pdata, length, offset));
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=, file = file](Void _) -> Future<Void> {
if (pdata)
return holdWhile(arena, file->write(pdata, length, offset));
return file->write(data, length, offset);
});
return file->write(data, length, offset);
});
}
Future<Void> truncate(int64_t size) override {
@ -125,8 +128,9 @@ public:
return file->truncate(size);
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=](Void _) -> Future<Void> { return file->truncate(size); });
delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->truncate(size); });
}
Future<Void> sync() override {
@ -135,8 +139,9 @@ public:
return file->sync();
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=](Void _) -> Future<Void> { return file->sync(); });
delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->sync(); });
}
Future<int64_t> size() const override {
@ -145,8 +150,9 @@ public:
return file->size();
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<int64_t>(Void)>, int64_t>(
delay(diskDelay), [=](Void _) -> Future<int64_t> { return file->size(); });
delay(diskDelay), [=, file = file](Void _) -> Future<int64_t> { return file->size(); });
}
int64_t debugFD() const override { return file->debugFD(); }
View File
@ -46,12 +46,17 @@ ACTOR Future<Void> sendErrorOnProcess(ISimulator::ProcessInfo* process,
TaskPriority taskID);
ACTOR template <class T>
Future<T> sendErrorOnShutdown(Future<T> in) {
choose {
when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
throw io_error().asInjectedFault();
Future<T> sendErrorOnShutdown(Future<T> in, bool assertOnCancel = false) {
try {
choose {
when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
throw io_error().asInjectedFault();
}
when(T rep = wait(in)) { return rep; }
}
when(T rep = wait(in)) { return rep; }
} catch (Error& e) {
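// With assertOnCancel set, cancellation of the wrapped read/write is treated as a bug
// rather than silently rethrown, since such futures are expected to be kept alive
// (e.g. via uncancellable) by their callers.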
ASSERT(e.code() != error_code_actor_cancelled || !assertOnCancel);
throw;
}
}
@ -59,9 +64,12 @@ class AsyncFileDetachable final : public IAsyncFile, public ReferenceCounted<Asy
private:
Reference<IAsyncFile> file;
Future<Void> shutdown;
bool assertOnReadWriteCancel;
public:
explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file) { shutdown = doShutdown(this); }
explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file), assertOnReadWriteCancel(true) {
shutdown = doShutdown(this);
}
ACTOR Future<Void> doShutdown(AsyncFileDetachable* self) {
wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()));
@ -84,13 +92,13 @@ public:
Future<int> read(void* data, int length, int64_t offset) override {
if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
return io_error().asInjectedFault();
return sendErrorOnShutdown(file->read(data, length, offset));
return sendErrorOnShutdown(file->read(data, length, offset), assertOnReadWriteCancel);
}
Future<Void> write(void const* data, int length, int64_t offset) override {
if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
return io_error().asInjectedFault();
return sendErrorOnShutdown(file->write(data, length, offset));
return sendErrorOnShutdown(file->write(data, length, offset), assertOnReadWriteCancel);
}
Future<Void> truncate(int64_t size) override {
View File
@ -52,7 +52,7 @@ public:
state Reference<CacheBlock> block(new CacheBlock(length));
try {
int len = wait(f->m_f->read(block->data, length, offset));
int len = wait(uncancellable(holdWhile(block, f->m_f->read(block->data, length, offset))));
block->len = len;
} catch (Error& e) {
f->m_max_concurrent_reads.release(1);
View File
@ -32,14 +32,18 @@ public:
// For read() and write(), the data buffer must remain valid until the future is ready
Future<int> read(void* data, int length, int64_t offset) override {
return map(m_f->read(data, length, offset), [=](int r) {
updateChecksumHistory(false, offset, r, (uint8_t*)data);
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->read(data, length, offset), [self, data, offset](int r) {
self->updateChecksumHistory(false, offset, r, (uint8_t*)data);
return r;
});
}
Future<Void> readZeroCopy(void** data, int* length, int64_t offset) override {
return map(m_f->readZeroCopy(data, length, offset), [=](Void r) {
updateChecksumHistory(false, offset, *length, (uint8_t*)data);
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->readZeroCopy(data, length, offset), [self, data, length, offset](Void r) {
self->updateChecksumHistory(false, offset, *length, (uint8_t*)data);
return r;
});
}
@ -50,12 +54,14 @@ public:
}
Future<Void> truncate(int64_t size) override {
return map(m_f->truncate(size), [=](Void r) {
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->truncate(size), [self, size](Void r) {
// Truncate the page checksum history if it is in use
if ((size / checksumHistoryPageSize) < checksumHistory.size()) {
int oldCapacity = checksumHistory.capacity();
checksumHistory.resize(size / checksumHistoryPageSize);
checksumHistoryBudget.get() -= (checksumHistory.capacity() - oldCapacity);
if ((size / checksumHistoryPageSize) < self->checksumHistory.size()) {
int oldCapacity = self->checksumHistory.capacity();
self->checksumHistory.resize(size / checksumHistoryPageSize);
checksumHistoryBudget.get() -= (self->checksumHistory.capacity() - oldCapacity);
}
return r;
});
View File
@ -239,7 +239,7 @@ public:
// Sets endpoint to be a new local endpoint which delivers messages to the given receiver
void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
void addEndpoints(std::vector<std::pair<struct FlowReceiver*, TaskPriority>> const& streams);
void addEndpoints(std::vector<std::pair<class FlowReceiver*, TaskPriority>> const& streams);
// The given local endpoint no longer delivers messages to the given receiver or uses resources
void removeEndpoint(const Endpoint&, NetworkMessageReceiver*);
View File
@ -42,8 +42,6 @@ struct TenantInfo {
// Is set during deserialization. It will be set to true if the tenant
// name is set and the client is authorized to use this tenant.
bool tenantAuthorized = false;
// Number of storage bytes currently used by this tenant.
int64_t storageUsage = 0;
// Helper function for most endpoints that read/write data. This returns true iff
// the client is either a) a trusted peer or b) is accessing keyspace belonging to a tenant,
View File
@ -28,9 +28,14 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/networksender.actor.h"
struct FlowReceiver : public NetworkMessageReceiver {
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
class FlowReceiver : public NetworkMessageReceiver, public NonCopyable {
Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
protected:
FlowReceiver() : m_isLocalEndpoint(false), m_stream(false) {}
FlowReceiver(Endpoint const& remoteEndpoint, bool stream)
@ -46,8 +51,17 @@ struct FlowReceiver : public NetworkMessageReceiver {
}
}
bool isLocalEndpoint() { return m_isLocalEndpoint; }
bool isRemoteEndpoint() { return endpoint.isValid() && !m_isLocalEndpoint; }
public:
bool isLocalEndpoint() const { return m_isLocalEndpoint; }
bool isRemoteEndpoint() const { return endpoint.isValid() && !m_isLocalEndpoint; }
void setRemoteEndpoint(Endpoint const& remoteEndpoint, bool stream) {
ASSERT(!m_isLocalEndpoint);
ASSERT(!endpoint.isValid());
endpoint = remoteEndpoint;
m_stream = stream;
FlowTransport::transport().addPeerReference(endpoint, m_stream);
}
// If already a remote endpoint, returns that. Otherwise makes this
// a local endpoint and returns that.
@ -80,12 +94,6 @@ struct FlowReceiver : public NetworkMessageReceiver {
}
const Endpoint& getRawEndpoint() { return endpoint; }
private:
Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
};
template <class T>
@ -363,8 +371,9 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
this->sendError(message.getError());
} else {
if (message.get().asUnderlyingType().acknowledgeToken.present()) {
acknowledgements = AcknowledgementReceiver(
FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()));
acknowledgements.setRemoteEndpoint(
FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()),
false);
if (onConnect.isValid() && onConnect.canBeSet()) {
onConnect.send(Void());
}
View File
@ -1240,6 +1240,7 @@ public:
PromiseTask* task = self->taskQueue.getReadyTask();
self->taskQueue.popReadyTask();
self->execTask(*task);
delete task;
self->yielded = false;
}
}
@ -2261,7 +2262,7 @@ public:
}
// Implementation
struct PromiseTask final {
struct PromiseTask final : public FastAllocated<PromiseTask> {
Promise<Void> promise;
ProcessInfo* machine;
explicit PromiseTask(ProcessInfo* machine) : machine(machine) {}
View File
@ -3537,7 +3537,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
}
// skip the rest of the algorithm for the first blob manager
if (bmData->epoch == 1) {
if (bmData->epoch == 1 && !isFullRestoreMode()) {
bmData->doneRecovering.send(Void());
return Void();
}
View File
@ -26,6 +26,7 @@
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbserver/Knobs.h"
#include "flow/FastRef.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/BlobConnectionProvider.h"
@ -189,23 +190,6 @@ private:
static const int sMaxCount_{ 5 }; // max number of manifest files to keep
};
// Defines granule info needed by a full restore
struct BlobGranuleVersion {
// Two constructors required by VectorRef
BlobGranuleVersion() {}
BlobGranuleVersion(Arena& a, const BlobGranuleVersion& copyFrom)
: granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version),
sizeInBytes(copyFrom.sizeInBytes) {}
UID granuleID;
KeyRangeRef keyRange;
Version version;
int64_t sizeInBytes;
};
// Defines a vector for BlobGranuleVersion
typedef Standalone<VectorRef<BlobGranuleVersion>> BlobGranuleVersionVector;
// Defines the filename, version, and size of each granule file needed by a full restore
struct GranuleFileVersion {
Version version;
@ -226,16 +210,53 @@ public:
Value data = wait(readFromFile(self));
Standalone<BlobManifest> manifest = decode(data);
wait(writeSystemKeys(self, manifest.rows));
BlobGranuleVersionVector _ = wait(listGranules(self));
BlobGranuleRestoreVersionVector _ = wait(listGranules(self));
} catch (Error& e) {
dprint("WARNING: unexpected manifest loader error {}\n", e.what()); // skip error handling so far
}
return Void();
}
// Iterate active granules and return their version/sizes
ACTOR static Future<BlobGranuleRestoreVersionVector> listGranules(Reference<BlobManifestLoader> self) {
state Transaction tr(self->db_);
loop {
state BlobGranuleRestoreVersionVector results;
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
std::vector<KeyRangeRef> granules;
state int i = 0;
auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
for (i = 0; i < blobRanges.size() - 1; i++) {
Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
try {
Standalone<BlobGranuleRestoreVersion> granule = wait(getGranule(&tr, granuleRange));
results.push_back_deep(results.arena(), granule);
} catch (Error& e) {
if (e.code() == error_code_restore_missing_data) {
dprint("missing data for key range {} \n", granuleRange.toString());
TraceEvent("BlobRestoreMissingData").detail("KeyRange", granuleRange.toString());
} else {
throw;
}
}
}
return results;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Print out a summary for blob granules
ACTOR static Future<Void> print(Reference<BlobManifestLoader> self) {
state BlobGranuleVersionVector granules = wait(listGranules(self));
state BlobGranuleRestoreVersionVector granules = wait(listGranules(self));
for (auto granule : granules) {
wait(checkGranuleFiles(self, granule));
}
@ -285,41 +306,9 @@ private:
}
}
// Iterate active granules and return their version/sizes
ACTOR static Future<BlobGranuleVersionVector> listGranules(Reference<BlobManifestLoader> self) {
state Transaction tr(self->db_);
loop {
state BlobGranuleVersionVector results;
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
std::vector<KeyRangeRef> granules;
state int i = 0;
auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
for (i = 0; i < blobRanges.size() - 1; i++) {
Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
try {
Standalone<BlobGranuleVersion> granule = wait(getGranule(&tr, granuleRange));
results.push_back_deep(results.arena(), granule);
} catch (Error& e) {
dprint("missing data for key range {} \n", granuleRange.toString());
}
}
return results;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Find the newest granule for a key range. The newest granule has the max version and relevant files
ACTOR static Future<Standalone<BlobGranuleVersion>> getGranule(Transaction* tr, KeyRangeRef range) {
state Standalone<BlobGranuleVersion> granuleVersion;
ACTOR static Future<Standalone<BlobGranuleRestoreVersion>> getGranule(Transaction* tr, KeyRangeRef range) {
state Standalone<BlobGranuleRestoreVersion> granuleVersion;
KeyRange historyKeyRange = blobGranuleHistoryKeyRangeFor(range);
// reverse lookup so that the first row is the newest version
state RangeResult results =
@ -389,7 +378,7 @@ private:
}
// Read data from granules and print out summary
ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleVersion granule) {
ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleRestoreVersion granule) {
state KeyRangeRef range = granule.keyRange;
state Version readVersion = granule.version;
state Transaction tr(self->db_);
@ -441,3 +430,11 @@ ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProv
wait(BlobManifestLoader::print(loader));
return Void();
}
// API to list blob granules
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db,
Reference<BlobConnectionProvider> blobConn) {
Reference<BlobManifestLoader> loader = makeReference<BlobManifestLoader>(db, blobConn);
BlobGranuleRestoreVersionVector result = wait(BlobManifestLoader::listGranules(loader));
return result;
}
View File
@ -30,54 +30,312 @@
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/ServerDBInfo.actor.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "flow/actorcompiler.h" // has to be last include
#include "flow/network.h"
#include <algorithm>
#include <string>
#define ENABLE_DEBUG_MG true
template <typename... T>
static inline void dprint(fmt::format_string<T...> fmt, T&&... args) {
if (ENABLE_DEBUG_MG)
fmt::print(fmt, std::forward<T>(args)...);
}
// BlobMigrator manages data migration from blob storage to storage server. It implements a minimal set of
// StorageServerInterface APIs which are needed for DataDistributor to start data migration.
class BlobMigrator : public NonCopyable, public ReferenceCounted<BlobMigrator> {
public:
BlobMigrator(Reference<AsyncVar<ServerDBInfo> const> dbInfo, BlobMigratorInterface interf)
: blobMigratorInterf(interf), actors(false) {
if (!blobConn.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
blobConn = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
: interf_(interf), actors_(false) {
if (!blobConn_.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
blobConn_ = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
}
db = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
db_ = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
}
~BlobMigrator() {}
// Start migration
ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
self->actors.add(waitFailureServer(self->blobMigratorInterf.waitFailure.getFuture()));
if (!isFullRestoreMode()) {
return Void();
}
wait(delay(10)); // TODO: wait for a readiness signal from the blob manager instead of a fixed delay
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
self->blobGranules_ = granules;
wait(prepare(self, normalKeys));
wait(serverLoop(self));
return Void();
}
private:
// Prepare for data migration for given key range.
ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
// Register as a storage server so that DataDistributor can start data movement afterwards
std::pair<Version, Tag> verAndTag = wait(addStorageServer(self->db_, self->interf_.ssi));
dprint("Started storage server interface {} {}\n", verAndTag.first, verAndTag.second.toString());
// Reassign key ranges to the storage server
// It'll restart DataDistributor so that internal data structures like ShardTracker and
// ShardsAffectedByTeamFailure can be re-initialized. Ideally this would be done within
// DataDistributor itself, so that no restart would be needed.
state int oldMode = wait(setDDMode(self->db_, 0));
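// Data distribution is paused while ownership is rewritten below, then restored to its
// previous mode so DD re-reads the new shard assignment.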
wait(unassignServerKeys(self, keys));
wait(assignKeysToServer(self, keys, self->interf_.ssi.id()));
wait(success(setDDMode(self->db_, oldMode)));
return Void();
}
// Assign the given key range to the specified storage server
ACTOR static Future<Void> assignKeysToServer(Reference<BlobMigrator> self, KeyRangeRef keys, UID serverUID) {
state Transaction tr(self->db_);
loop {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->blobMigratorInterf.haltBlobMigrator.getFuture())) {
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->blobMigratorInterf.id()).detail("ReqID", req.requesterID);
break;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
state Value value = keyServersValue(std::vector<UID>({ serverUID }), std::vector<UID>(), UID(), UID());
wait(krmSetRange(&tr, keyServersPrefix, keys, value));
wait(krmSetRange(&tr, serverKeysPrefixFor(serverUID), keys, serverKeysTrue));
wait(tr.commit());
dprint("Assign {} to server {}\n", normalKeys.toString(), serverUID.toString());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Unassign given key range from its current storage servers
ACTOR static Future<Void> unassignServerKeys(Reference<BlobMigrator> self, KeyRangeRef keys) {
state Transaction tr(self->db_);
loop {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
state RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
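// For every storage server, check whether it currently owns any part of `keys`
// (serverKeysTrue) and, if so, mark the whole range unassigned.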
for (auto& server : serverList) {
state UID id = decodeServerListValue(server.value).id();
RangeResult ranges = wait(krmGetRanges(&tr, serverKeysPrefixFor(id), keys));
bool owning = false;
for (auto& r : ranges) {
if (r.value == serverKeysTrue) {
owning = true;
break;
}
}
if (owning) {
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
}
}
when(wait(self->actors.getResult())) {}
wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Main server loop
ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
self->actors_.add(handleRequest(self));
self->actors_.add(handleUnsupportedRequest(self));
loop {
try {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
break;
}
when(wait(self->actors_.getResult())) {}
}
} catch (Error& e) {
dprint("Unexpected serverLoop error {}\n", e.what());
throw;
}
}
return Void();
}
// Handle StorageServerInterface APIs
ACTOR static Future<Void> handleRequest(Reference<BlobMigrator> self) {
state StorageServerInterface ssi = self->interf_.ssi;
loop {
try {
choose {
when(GetShardStateRequest req = waitNext(ssi.getShardState.getFuture())) {
dprint("Handle GetShardStateRequest\n");
Version version = maxVersion(self);
GetShardStateReply rep(version, version);
req.reply.send(rep); // return empty shards
}
when(WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
// dprint("Handle WaitMetricsRequest\n");
self->actors_.add(processWaitMetricsRequest(self, req));
}
when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
dprint("Handle SplitMetrics {}\n", req.keys.toString());
SplitMetricsReply rep;
for (auto granule : self->blobGranules_) {
// TODO: Use granule boundary as split point. A better approach is to split by size
if (granule.keyRange.begin > req.keys.begin && granule.keyRange.end < req.keys.end)
rep.splits.push_back_deep(rep.splits.arena(), granule.keyRange.begin);
}
req.reply.send(rep);
}
when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
fmt::print("Handle GetStorageMetrics\n");
StorageMetrics metrics;
metrics.bytes = sizeInBytes(self);
GetStorageMetricsReply resp;
resp.load = metrics;
req.reply.send(resp);
}
when(ReplyPromise<KeyValueStoreType> reply = waitNext(ssi.getKeyValueStoreType.getFuture())) {
dprint("Handle KeyValueStoreType\n");
reply.send(KeyValueStoreType::MEMORY);
}
}
} catch (Error& e) {
dprint("Unexpected blob migrator request error {}\n", e.what());
throw;
}
}
}
// Handle StorageServerInterface APIs that are not supported. Simply log and return an error
ACTOR static Future<Void> handleUnsupportedRequest(Reference<BlobMigrator> self) {
state StorageServerInterface ssi = self->interf_.ssi;
loop {
try {
choose {
when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
dprint("Unsupported SplitRangeRequest\n");
req.reply.sendError(unsupported_operation());
}
when(StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
self->actors_.add(processStorageQueuingMetricsRequest(req));
}
when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
dprint("Unsupported ReadHotSubRange\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyValuesStreamRequest req = waitNext(ssi.getKeyValuesStream.getFuture())) {
dprint("Unsupported GetKeyValuesStreamRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyRequest req = waitNext(ssi.getKey.getFuture())) {
dprint("Unsupported GetKeyRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture())) {
/* dprint("Unsupported GetKeyValuesRequest {} - {} @ {}\n",
req.begin.getKey().printable(),
req.end.getKey().printable(),
req.version); */
req.reply.sendError(unsupported_operation());
}
when(GetValueRequest req = waitNext(ssi.getValue.getFuture())) {
dprint("Unsupported GetValueRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetCheckpointRequest req = waitNext(ssi.checkpoint.getFuture())) {
dprint("Unsupported GetCheckpoint \n");
req.reply.sendError(unsupported_operation());
}
when(FetchCheckpointRequest req = waitNext(ssi.fetchCheckpoint.getFuture())) {
dprint("Unsupported FetchCheckpointRequest\n");
req.reply.sendError(unsupported_operation());
}
when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
dprint("Unsupported UpdateCommitCostRequest\n");
req.reply.sendError(unsupported_operation());
}
when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
dprint("Unsupported FetchCheckpointKeyValuesRequest\n");
req.reply.sendError(unsupported_operation());
}
}
} catch (Error& e) {
dprint("Unexpected request handling error {}\n", e.what());
throw;
}
}
}
ACTOR static Future<Void> processWaitMetricsRequest(Reference<BlobMigrator> self, WaitMetricsRequest req) {
state WaitMetricsRequest waitMetricsRequest = req;
// FIXME: get rid of this delay. It is a temporary workaround to avoid starvation in the scheduling of DD
// processes
wait(delay(1));
StorageMetrics metrics;
metrics.bytes = sizeInBytes(self, waitMetricsRequest.keys);
waitMetricsRequest.reply.send(metrics);
return Void();
}
ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
dprint("Unsupported StorageQueuingMetricsRequest\n");
// FIXME: get rid of this delay. It is a temporary workaround to avoid starvation in the scheduling of DD
// processes
wait(delay(1));
req.reply.sendError(unsupported_operation());
return Void();
}
// Return total storage size in bytes for migration
static int64_t sizeInBytes(Reference<BlobMigrator> self) { return sizeInBytes(self, normalKeys); }
// Return storage size in bytes for given key range
static int64_t sizeInBytes(Reference<BlobMigrator> self, KeyRangeRef range) {
int64_t bytes = 0;
for (auto granule : self->blobGranules_) {
if (range.intersects(granule.keyRange))
bytes += granule.sizeInBytes;
}
return bytes;
}
// Return max version for all blob granules
static Version maxVersion(Reference<BlobMigrator> self) {
Version max = 0;
for (auto granule : self->blobGranules_) {
max = std::max(granule.version, max);
}
return max;
}
private:
Database db;
Reference<BlobConnectionProvider> blobConn;
BlobMigratorInterface blobMigratorInterf;
ActorCollection actors;
Database db_;
Reference<BlobConnectionProvider> blobConn_;
BlobGranuleRestoreVersionVector blobGranules_;
BlobMigratorInterface interf_;
ActorCollection actors_;
};
// Main entry point
ACTOR Future<Void> blobMigrator(BlobMigratorInterface ssi, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", ssi.id().toString());
ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", interf.id().toString());
try {
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, ssi);
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
wait(BlobMigrator::start(self));
} catch (Error& e) {
fmt::print("unexpected blob migrator error {}\n", e.what());
dprint("Unexpected blob migrator error {}\n", e.what());
TraceEvent("BlobMigratorError", interf.id()).error(e);
}
return Void();
}

@ -3961,7 +3961,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
}
if (createChangeFeed) {
if (createChangeFeed && !isFullRestoreMode()) {
// create new change feed for new version of granule
wait(updateChangeFeed(
&tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));

@ -2615,8 +2615,9 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
}
loop {
if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
state Future<Void> wfClient =
waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
loop {
choose {
when(wait(wfClient)) {

@ -687,6 +687,20 @@ struct DDQueue : public IDDRelocationQueue {
Reference<EventCacheHolder> movedKeyServersEventHolder;
int moveReusePhysicalShard;
int moveCreateNewPhysicalShard;
enum RetryFindDstReason {
None = 0,
RemoteBestTeamNotReady,
PrimaryNoHealthyTeam,
RemoteNoHealthyTeam,
RemoteTeamIsFull,
RemoteTeamIsNotHealthy,
NoAvailablePhysicalShard,
NumberOfTypes,
};
std::vector<int> retryFindDstReasonCount;
void startRelocation(int priority, int healthPriority) {
// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
// we must count it into unhealthyRelocations; because team removers relies on unhealthyRelocations to
@ -750,7 +764,9 @@ struct DDQueue : public IDDRelocationQueue {
output(output), input(input), getShardMetrics(getShardMetrics), getTopKMetrics(getTopKMetrics), lastInterval(0),
suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")) {}
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
moveCreateNewPhysicalShard(0), retryFindDstReasonCount(static_cast<int>(RetryFindDstReason::NumberOfTypes), 0) {
}
DDQueue() = default;
void validate() {
@ -1463,6 +1479,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
loop {
destOverloadedCount = 0;
stuckCount = 0;
state DDQueue::RetryFindDstReason retryFindDstReason = DDQueue::RetryFindDstReason::None;
// state int bestTeamStuckThreshold = 50;
loop {
state int tciIndex = 0;
@ -1489,10 +1506,13 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
.detail("TeamCollectionIndex", tciIndex)
.detail("RestoreDataMoveForDest",
describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest));
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
foundTeams = false;
break;
}
if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) {
retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
: DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
foundTeams = false;
break;
}
@ -1545,12 +1565,15 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// getting the destination team or we could miss failure notifications for the storage
// servers in the destination team
TraceEvent("BestTeamNotReady");
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
foundTeams = false;
break;
}
// If a DC has no healthy team, we stop checking the other DCs until
// the unhealthy DC is healthy again or is excluded.
if (!bestTeam.first.present()) {
retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
: DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
foundTeams = false;
break;
}
@ -1574,6 +1597,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
double minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
foundTeams = false;
break;
}
@ -1616,6 +1640,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
if (!bestTeams[1].first->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
}
}
@ -1676,6 +1701,19 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// When !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate;
// thus, update the physicalShardIDCandidate in the related data structures
ASSERT(physicalShardIDCandidate != UID().first());
if (self->physicalShardCollection->physicalShardExists(physicalShardIDCandidate)) {
self->moveReusePhysicalShard++;
} else {
self->moveCreateNewPhysicalShard++;
if (retryFindDstReason == DDQueue::RetryFindDstReason::None) {
// Creating a new physical shard when the retry reason is None can only happen when
// determinePhysicalShardIDGivenPrimaryTeam() finds that there is no available physical
// shard.
self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++;
} else {
self->retryFindDstReasonCount[retryFindDstReason]++;
}
}
rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
inFlightRange.value().dataMoveId = rd.dataMoveId;
@ -2472,6 +2510,30 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
.trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by
// DataDistributor::movingDataEventHolder. The track latest
// key we use here must match the key used in the holder.
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
TraceEvent("PhysicalShardMoveStats")
.detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
.detail("MoveReusePhysicalShard", self.moveReusePhysicalShard)
.detail("RemoteBestTeamNotReady",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteBestTeamNotReady])
.detail("PrimaryNoHealthyTeam",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam])
.detail("RemoteNoHealthyTeam",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteNoHealthyTeam])
.detail("RemoteTeamIsFull",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull])
.detail("RemoteTeamIsNotHealthy",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy])
.detail(
"NoAvailablePhysicalShard",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]);
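// Reset the counters after each report so this trace event carries per-interval deltas.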
self.moveCreateNewPhysicalShard = 0;
self.moveReusePhysicalShard = 0;
for (int i = 0; i < self.retryFindDstReasonCount.size(); ++i) {
self.retryFindDstReasonCount[i] = 0;
}
}
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(ddQueueFutures))) {}

@ -2081,6 +2081,10 @@ void PhysicalShardCollection::logPhysicalShardCollection() {
}
}
bool PhysicalShardCollection::physicalShardExists(uint64_t physicalShardID) {
return physicalShardInstances.find(physicalShardID) != physicalShardInstances.end();
}
// FIXME: complete this test with non-empty range
TEST_CASE("/DataDistributor/Tracker/FetchTopK") {
state DataDistributionTracker self;

@ -286,8 +286,6 @@ public:
PromiseStream<RelocateShard> relocationProducer, relocationConsumer;
Reference<PhysicalShardCollection> physicalShardCollection;
StorageQuotaInfo storageQuotaInfo;
Promise<Void> initialized;
std::unordered_map<AuditType, std::vector<std::shared_ptr<DDAudit>>> audits;
@ -542,27 +540,6 @@ public:
}
};
ACTOR Future<Void> storageQuotaTracker(Database cx, StorageQuotaInfo* storageQuotaInfo) {
loop {
state Transaction tr(cx);
loop {
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size());
for (auto const kv : currentQuotas) {
Key const key = kv.key.removePrefix(storageQuotaPrefix);
uint64_t const quota = BinaryReader::fromStringRef<uint64_t>(kv.value, Unversioned());
storageQuotaInfo->quotaMap[key] = quota;
}
wait(delay(5.0));
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
}
// Periodically check and log the physicalShard status; clean up empty physicalShards
ACTOR Future<Void> monitorPhysicalShardStatus(Reference<PhysicalShardCollection> self) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
@ -683,16 +660,15 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(storageQuotaTracker(cx, &self->storageQuotaInfo),
"StorageQuotaTracker",
self->ddId,
&normalDDQueueErrors()));
if (ddIsTenantAware) {
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorTenantMap(),
"DDTenantCacheMonitor",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageQuota(),
"StorageQuotaTracker",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(),
"StorageUsageTracker",
self->ddId,

@ -429,7 +429,7 @@ public:
waitfor.push_back(self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos));
self->writingPos += pageData.size();
return waitForAll(waitfor);
return waitForAllReadyThenThrow(waitfor);
}
// Write the given data (pageData) to the queue files of self, sync data to disk, and delete the memory (pageMem)
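The switch from waitForAll to waitForAllReadyThenThrow above is deliberate: waitForAll throws as soon as any input future fails, which would abandon sibling disk writes that may still reference caller-owned buffers. A minimal sketch of the intended semantics, assuming Flow's ACTOR syntax and the existing waitForAllReady helper (illustrative, not necessarily the library's exact implementation):

ACTOR template <class T>
Future<Void> waitForAllReadyThenThrowSketch(std::vector<Future<T>> futures) {
	// First wait until every future is ready, success or error, so no I/O is left outstanding...
	wait(waitForAllReady(futures));
	// ...then surface the first error, if any.
	for (auto& f : futures) {
		if (f.isError()) {
			throw f.getError();
		}
	}
	return Void();
}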
@ -655,7 +655,7 @@ public:
for (int i = 0; i < 2; i++)
if (self->files[i].size > 0)
reads.push_back(self->files[i].f->read(self->firstPages[i], sizeof(Page), 0));
wait(waitForAll(reads));
wait(waitForAllReadyThenThrow(reads));
// Determine which file comes first
if (compare(self->firstPages[1], self->firstPages[0])) {
@ -743,7 +743,10 @@ public:
}
// Read nPages from pageOffset*sizeof(Page) offset in file self->files[file]
ACTOR static Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self, int file, int pageOffset, int nPages) {
ACTOR static UNCANCELLABLE Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self,
int file,
int pageOffset,
int nPages) {
state TrackMe trackMe(self);
state const size_t bytesRequested = nPages * sizeof(Page);
state Standalone<StringRef> result = makeAlignedString(sizeof(Page), bytesRequested);

@ -388,6 +388,15 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData
try {
KmsConnLookupEKsByKeyIdsReq keysByIdsReq;
for (const auto& item : lookupCipherInfoMap) {
// TODO: Currently getEncryptCipherKeys does not pass the domain name; once that is fixed, we can remove
// the check for an empty domain name
if (!item.second.domainName.empty()) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
}
keysByIdsReq.encryptKeyInfos.emplace_back_deep(
keysByIdsReq.arena, item.second.domainId, item.second.baseCipherId, item.second.domainName);
}
@ -527,6 +536,11 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
try {
KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq;
for (const auto& item : lookupCipherDomains) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
keysByDomainIdReq.encryptDomainInfos.emplace_back_deep(
keysByDomainIdReq.arena, item.second.domainId, item.second.domainName);
}

@ -53,7 +53,11 @@ struct KeyValueStoreCompressTestData final : IKeyValueStore {
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override {
store->set(KeyValueRef(keyValue.key, pack(keyValue.value)), arena);
}
void clear(KeyRangeRef range, const Arena* arena = nullptr) override { store->clear(range, arena); }
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override {
store->clear(range, storageMetrics, arena);
}
Future<Void> commit(bool sequential = false) override { return store->commit(sequential); }
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override {

@ -130,7 +130,7 @@ public:
}
}
void clear(KeyRangeRef range, const Arena* arena) override {
void clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) override {
// A commit that occurs with no available space returns Never, so we can throw out all modifications
if (getAvailableSize() <= 0)
return;

@ -1846,22 +1846,52 @@ struct RocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
writeBatch->Put(defaultFdbCF, toSlice(kv.key), toSlice(kv.value));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE) {
keysSet.insert(kv.key);
}
}
void clear(KeyRangeRef keyRange, const Arena*) override {
void clear(KeyRangeRef keyRange, const StorageServerMetrics* storageMetrics, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
if (keyRange.singleKeyRange()) {
writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin));
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr &&
storageMetrics->byteSample.getEstimate(keyRange) <
SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) {
rocksdb::ReadOptions options = sharedState->getReadOptions();
auto beginSlice = toSlice(keyRange.begin);
auto endSlice = toSlice(keyRange.end);
options.iterate_lower_bound = &beginSlice;
options.iterate_upper_bound = &endSlice;
auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options, defaultFdbCF));
cursor->Seek(toSlice(keyRange.begin));
while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) {
writeBatch->Delete(defaultFdbCF, cursor->key());
cursor->Next();
}
if (!cursor->status().ok()) {
// If the read-range iteration fails, fall back to a DeleteRange.
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
} else {
auto it = keysSet.lower_bound(keyRange.begin);
while (it != keysSet.end() && *it < keyRange.end) {
writeBatch->Delete(defaultFdbCF, toSlice(*it));
it++;
}
}
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
}
}
}
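A condensed, standalone sketch of the small-range strategy above, assuming a plain rocksdb::DB* on the default column family; kSmallRangeBytes stands in for ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, and the caller-supplied estimate stands in for the byte-sample lookup (the in-tree version additionally deletes keys buffered in the current write batch via keysSet):

#include <memory>
#include <rocksdb/db.h>
#include <rocksdb/write_batch.h>

// Convert a range clear into point deletes when the range is believed to be small,
// falling back to a range tombstone if iteration fails.
void clearRangeSketch(rocksdb::DB* db,
                      rocksdb::WriteBatch* batch,
                      const rocksdb::Slice& begin,
                      const rocksdb::Slice& end,
                      int64_t estimatedBytes) {
	const int64_t kSmallRangeBytes = 200000; // illustrative threshold
	if (estimatedBytes >= kSmallRangeBytes) {
		batch->DeleteRange(begin, end);
		return;
	}
	rocksdb::ReadOptions options;
	options.iterate_upper_bound = &end; // bound the scan to the cleared range
	std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options));
	for (it->Seek(begin); it->Valid(); it->Next()) {
		batch->Delete(it->key()); // tombstone each existing key individually
	}
	if (!it->status().ok()) {
		batch->DeleteRange(begin, end); // iteration failed; use a range tombstone instead
	}
}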
@ -1890,6 +1920,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
auto a = new Writer::CommitAction();
a->batchToCommit = std::move(writeBatch);
keysSet.clear();
auto res = a->done.getFuture();
writeThread->post(a);
return res;
@ -2083,6 +2114,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Promise<Void> closePromise;
Future<Void> openFuture;
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::set<Key> keysSet;
Optional<Future<Void>> metrics;
FlowLock readSemaphore;
int numReadWaiters;

@ -1596,7 +1596,9 @@ public:
StorageBytes getStorageBytes() const override;
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override;
Future<Void> commit(bool sequential = false) override;
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override;
@ -2215,7 +2217,7 @@ void KeyValueStoreSQLite::set(KeyValueRef keyValue, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::SetAction(keyValue));
}
void KeyValueStoreSQLite::clear(KeyRangeRef range, const Arena* arena) {
void KeyValueStoreSQLite::clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::ClearAction(range));
}

@ -49,6 +49,7 @@ static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 :
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
const std::string rocksDataFolderSuffix = "-data";
const std::string METADATA_SHARD_ID = "kvs-metadata";
const KeyRef shardMappingPrefix("\xff\xff/ShardMapping/"_sr);
// TODO: move constants to a header file.
const StringRef ROCKSDBSTORAGE_HISTOGRAM_GROUP = "RocksDBStorage"_sr;
@ -304,13 +305,12 @@ rocksdb::ReadOptions getReadOptions() {
}
struct ReadIterator {
rocksdb::ColumnFamilyHandle* cf;
uint64_t index; // incrementing counter to uniquely identify read iterator.
bool inUse;
std::shared_ptr<rocksdb::Iterator> iter;
double creationTime;
ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, rocksdb::ReadOptions& options)
: cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
: index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
};
/*
@ -475,13 +475,26 @@ struct PhysicalShard {
}
~PhysicalShard() {
if (!deletePending)
return;
logShardEvent(id, ShardOp::CLOSE);
isInitialized.store(false);
readIterPool.reset();
// Destroy CF
auto s = db->DropColumnFamily(cf);
// Deleting default column family is not allowed.
if (id == "default") {
return;
}
if (deletePending) {
auto s = db->DropColumnFamily(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
}
auto s = db->DestroyColumnFamilyHandle(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logRocksDBError(s, "DestroyCFHandle");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
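Note the ordering the destructor now enforces: RocksDB does not allow dropping the default column family (hence the early return), DropColumnFamily only marks the family's data for deletion, and the handle must still be released with DestroyColumnFamilyHandle afterward, whether or not the data was dropped.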
@ -628,7 +641,7 @@ public:
std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
bool foundMetadata = false;
for (const auto& name : columnFamilies) {
if (name == "kvs-metadata") {
if (name == METADATA_SHARD_ID) {
foundMetadata = true;
}
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions });
@ -652,19 +665,19 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
.detail("PhysicalShardCount", handles.size());
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
for (auto handle : handles) {
if (handle->GetName() == "kvs-metadata") {
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
} else {
physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
auto shard = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
if (shard->id == METADATA_SHARD_ID) {
metadataShard = shard;
}
physicalShards[shard->id] = shard;
columnFamilyMap[handle->GetID()] = handle;
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
.detail("PhysicalShard", handle->GetName());
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId).detail("PhysicalShard", shard->id);
}
std::set<std::string> unusedShards(columnFamilies.begin(), columnFamilies.end());
unusedShards.erase("kvs-metadata");
unusedShards.erase(METADATA_SHARD_ID);
unusedShards.erase("default");
KeyRange keyRange = prefixRange(shardMappingPrefix);
@ -746,9 +759,11 @@ public:
defaultShard->dataShards[specialKeys.begin.toString()] = std::move(dataShard);
physicalShards[defaultShard->id] = defaultShard;
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata");
// Create metadata shard.
auto metadataShard = std::make_shared<PhysicalShard>(db, METADATA_SHARD_ID);
metadataShard->init();
columnFamilyMap[metadataShard->cf->GetID()] = metadataShard->cf;
physicalShards[METADATA_SHARD_ID] = metadataShard;
// Write special key range metadata.
writeBatch = std::make_unique<rocksdb::WriteBatch>();
@ -763,7 +778,6 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
writeBatch = std::make_unique<rocksdb::WriteBatch>();
dirtyShards = std::make_unique<std::set<PhysicalShard*>>();
@ -910,6 +924,9 @@ public:
std::vector<std::shared_ptr<PhysicalShard>> getPendingDeletionShards(double cleanUpDelay) {
std::vector<std::shared_ptr<PhysicalShard>> emptyShards;
double currentTime = now();
TraceEvent(SevInfo, "ShardedRocksDB", logId)
.detail("PendingDeletionShardQueueSize", pendingDeletionShards.size());
while (!pendingDeletionShards.empty()) {
const auto& id = pendingDeletionShards.front();
auto it = physicalShards.find(id);
@ -976,6 +993,10 @@ public:
.detail("Info", "RangeToPersist")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
auto it = physicalShards.find(METADATA_SHARD_ID);
ASSERT(it != physicalShards.end());
auto metadataShard = it->second;
writeBatch->DeleteRange(metadataShard->cf,
getShardMappingKey(range.begin, shardMappingPrefix),
getShardMappingKey(range.end, shardMappingPrefix));
@ -1043,24 +1064,30 @@ public:
}
void closeAllShards() {
for (auto& [_, shard] : physicalShards) {
shard->readIterPool.reset();
}
columnFamilyMap.clear();
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBClosed");
}
void destroyAllShards() {
closeAllShards();
std::vector<rocksdb::ColumnFamilyDescriptor> cfs;
for (const auto& [key, _] : physicalShards) {
cfs.push_back(rocksdb::ColumnFamilyDescriptor{ key, getCFOptions() });
columnFamilyMap.clear();
for (auto& [_, shard] : physicalShards) {
shard->deletePending = true;
}
auto s = rocksdb::DestroyDB(path, getOptions(), cfs);
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
s = rocksdb::DestroyDB(path, getOptions());
if (!s.ok()) {
logRocksDBError(s, "DestroyDB");
}
@ -1121,7 +1148,6 @@ private:
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::unique_ptr<std::set<PhysicalShard*>> dirtyShards;
KeyRangeMap<DataShard*> dataShardMap;
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
std::deque<std::string> pendingDeletionShards;
};
@ -2240,6 +2266,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
// TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are
// occurring.
if (g_network->isSimulated()) {
TraceEvent(SevDebug, "ShardedRocksDB").detail("Info", "Use Coro threads in simulation.");
writeThread = CoroThreadPool::createThreadPool();
readThreads = CoroThreadPool::createThreadPool();
} else {
@ -2316,7 +2343,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override { shardManager.put(kv.key, kv.value); }
void clear(KeyRangeRef range, const Arena*) override {
void clear(KeyRangeRef range, const StorageServerMetrics*, const Arena*) override {
if (range.singleKeyRange()) {
shardManager.clear(range.begin);
} else {

@ -405,10 +405,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
.detail("Offset", asset.offset)
.detail("Length", asset.len);
// Ensure data blocks in the same file are processed in order
wait(processedFileOffset->whenAtLeast(asset.offset));
ASSERT(processedFileOffset->get() == asset.offset);
state Arena tempArena;
state StringRefReader reader(buf, restore_corrupted_data());
try {
@ -430,8 +426,9 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
const uint8_t* message = reader.consume(msgSize);
// Skip mutations out of the version range
if (!asset.isInVersionRange(msgVersion.version))
if (!asset.isInVersionRange(msgVersion.version)) {
continue;
}
state VersionedMutationsMap::iterator it;
bool inserted;
@ -452,6 +449,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
// Skip mutations whose commitVersion < the range kv's version
if (logMutationTooOld(pRangeVersions, mutation, msgVersion.version)) {
cc->oldLogMutations += 1;
wait(yield()); // avoid potential stack overflows
continue;
}
@ -459,6 +457,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
if (mutation.param1 >= asset.range.end ||
(isRangeMutation(mutation) && mutation.param2 < asset.range.begin) ||
(!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) {
wait(yield()); // avoid potential stack overflows
continue;
}
@ -509,7 +508,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
.detail("BlockLen", asset.len);
throw;
}
processedFileOffset->set(asset.offset + asset.len);
return Void();
}
@ -526,8 +524,19 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
state int readFileRetries = 0;
loop {
try {
// Ensure data blocks in the same file are processed in order
wait(processedFileOffset->whenAtLeast(asset.offset));
ASSERT(processedFileOffset->get() == asset.offset);
wait(_parsePartitionedLogFileOnLoader(
pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset, cx));
processedFileOffset->set(asset.offset + asset.len);
TraceEvent("FastRestoreLoaderDecodingLogFileDone")
.detail("BatchIndex", asset.batchIndex)
.detail("Filename", asset.filename)
.detail("Offset", asset.offset)
.detail("Length", asset.len);
break;
} catch (Error& e) {
if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||

@ -529,6 +529,7 @@ ACTOR Future<Void> fetchCheckpointFile(Database cx,
state int64_t offset = 0;
state Reference<IAsyncFile> asyncFile;
loop {
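// Restart the transfer from the beginning of the file on each retry attempt.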
offset = 0;
try {
asyncFile = Reference<IAsyncFile>();
++attempt;
@ -559,7 +560,8 @@ ACTOR Future<Void> fetchCheckpointFile(Database cx,
offset += rep.data.size();
}
} catch (Error& e) {
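// In simulation, occasionally treat a first-attempt end_of_stream as a failure so the retry path is exercised.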
if (e.code() != error_code_end_of_stream) {
if (e.code() != error_code_end_of_stream ||
(g_network->isSimulated() && attempt == 1 && deterministicRandom()->coinflip())) {
TraceEvent("FetchCheckpointFileError")
.errorUnsuppressed(e)
.detail("RemoteFile", remoteFile)

@ -107,7 +107,8 @@ bool destructed = false;
class TestConfig : public BasicTestConfig {
class ConfigBuilder {
using value_type = toml::basic_value<toml::discard_comments>;
using base_variant = std::variant<int, float, double, bool, std::string, std::vector<int>, ConfigDBType>;
using base_variant = std::
variant<int, float, double, bool, std::string, std::vector<int>, std::vector<std::string>, ConfigDBType>;
using types =
variant_map<variant_concat<base_variant, variant_map<base_variant, Optional>>, std::add_pointer_t>;
std::unordered_map<std::string_view, types> confMap;
@ -148,6 +149,17 @@ class TestConfig : public BasicTestConfig {
(*this)(&res);
*val = std::move(res);
}
void operator()(std::vector<std::string>* val) const {
auto arr = value.as_array();
for (const auto& i : arr) {
val->emplace_back(i.as_string());
}
}
void operator()(Optional<std::vector<std::string>>* val) const {
std::vector<std::string> res;
(*this)(&res);
*val = std::move(res);
}
};
struct trace_visitor {
@ -178,6 +190,26 @@ class TestConfig : public BasicTestConfig {
(*this)(&(val->get()));
}
}
void operator()(std::vector<std::string> const* val) const {
if (val->empty()) {
evt.detail(key.c_str(), "[]");
return;
}
std::stringstream value;
value << "[" << val->at(0);
for (int i = 1; i < val->size(); ++i) {
value << "," << val->at(i);
}
value << "]";
evt.detail(key.c_str(), value.str());
}
void operator()(Optional<std::vector<std::string>> const* val) const {
if (!val->present()) {
evt.detail(key.c_str(), *val);
} else {
(*this)(&(val->get()));
}
}
void operator()(ConfigDBType const* val) const { evt.detail(key.c_str(), *val); }
void operator()(Optional<ConfigDBType> const* val) const {
Optional<std::string> optStr;
@ -312,12 +344,24 @@ class TestConfig : public BasicTestConfig {
if (attrib == "blobGranulesEnabled") {
blobGranulesEnabled = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "allowDefaultTenant") {
allowDefaultTenant = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "allowCreatingTenants") {
allowCreatingTenants = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSTargetedRestart") {
injectTargetedSSRestart = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSDelay") {
injectSSDelay = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "tenantModes") {
std::stringstream ss(value);
std::string token;
while (std::getline(ss, token, ',')) {
tenantModes.push_back(token);
}
}
if (attrib == "defaultTenant") {
defaultTenant = value;
}
}
@ -365,11 +409,14 @@ public:
bool randomlyRenameZoneId = false;
bool allowDefaultTenant = true;
bool allowDisablingTenants = true;
bool allowCreatingTenants = true;
bool injectTargetedSSRestart = false;
bool tenantModeRequired = false;
bool injectSSDelay = false;
// By default, the tenant mode is set randomly
// If provided, it is set using TenantMode::fromString
// Ensure there is no '_experimental' suffix in the mode name
std::vector<std::string> tenantModes;
Optional<std::string> defaultTenant;
std::string testClass; // unused -- used in TestHarness
float testPriority; // unused -- used in TestHarness
@ -432,12 +479,12 @@ public:
.add("extraMachineCountDC", &extraMachineCountDC)
.add("blobGranulesEnabled", &blobGranulesEnabled)
.add("allowDefaultTenant", &allowDefaultTenant)
.add("allowDisablingTenants", &allowDisablingTenants)
.add("allowCreatingTenants", &allowCreatingTenants)
.add("tenantModeRequired", &tenantModeRequired)
.add("randomlyRenameZoneId", &randomlyRenameZoneId)
.add("injectTargetedSSRestart", &injectTargetedSSRestart)
.add("injectSSDelay", &injectSSDelay);
.add("injectSSDelay", &injectSSDelay)
.add("tenantModes", &tenantModes)
.add("defaultTenant", &defaultTenant);
try {
auto file = toml::parse(testFile);
if (file.contains("configuration") && toml::find(file, "configuration").is_table()) {
@ -1118,18 +1165,18 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
int* pTesterCount,
Optional<ClusterConnectionString>* pConnString,
Standalone<StringRef>* pStartingConfiguration,
TestConfig testConfig,
TestConfig* testConfig,
std::string whitelistBinPaths,
ProtocolVersion protocolVersion) {
CSimpleIni ini;
ini.SetUnicode();
ini.LoadFile(joinPath(baseFolder, "restartInfo.ini").c_str());
auto configDBType = testConfig.getConfigDBType();
auto configDBType = testConfig->getConfigDBType();
// Randomly change data center id names to test that localities
// can be modified on cluster restart
bool renameZoneIds = testConfig.randomlyRenameZoneId ? deterministicRandom()->random01() < 0.1 : false;
bool renameZoneIds = testConfig->randomlyRenameZoneId ? deterministicRandom()->random01() < 0.1 : false;
CODE_PROBE(renameZoneIds, "Zone ID names altered in restart test");
// allows multiple ipAddr entries
@ -1146,26 +1193,34 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
int desiredCoordinators = atoi(ini.GetValue("META", "desiredCoordinators"));
int testerCount = atoi(ini.GetValue("META", "testerCount"));
auto tssModeStr = ini.GetValue("META", "tssMode");
auto tenantMode = ini.GetValue("META", "tenantMode");
if (tenantMode != nullptr) {
testConfig->tenantModes.push_back(tenantMode);
}
std::string defaultTenant = ini.GetValue("META", "defaultTenant", "");
if (!defaultTenant.empty()) {
testConfig->defaultTenant = defaultTenant;
}
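// For example (illustrative values), the restartInfo.ini written by the first phase of a restarting
// test might carry:
//   [META]
//   tenantMode=required
//   defaultTenant=SimulatedDefaultTenant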
if (tssModeStr != nullptr) {
g_simulator->tssMode = (ISimulator::TSSMode)atoi(tssModeStr);
}
ClusterConnectionString conn(ini.GetValue("META", "connectionString"));
if (testConfig.extraDatabaseMode == ISimulator::ExtraDatabaseMode::Local) {
if (testConfig->extraDatabaseMode == ISimulator::ExtraDatabaseMode::Local) {
g_simulator->extraDatabases.clear();
g_simulator->extraDatabases.push_back(conn.toString());
}
if (!testConfig.disableHostname) {
if (!testConfig->disableHostname) {
auto mockDNSStr = ini.GetValue("META", "mockDNS");
if (mockDNSStr != nullptr) {
INetworkConnections::net()->parseMockDNSFromString(mockDNSStr);
}
}
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
if (testConfig.disableRemoteKVS) {
if (testConfig->disableRemoteKVS) {
g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisableRemoteKVS");
}
if (testConfig.disableEncryption) {
if (testConfig->disableEncryption) {
g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false }));
g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false }));
g_knobs.setKnob("enable_storage_server_encryption", KnobValueRef::create(bool{ false }));
@ -2451,9 +2506,7 @@ ACTOR void setupAndRun(std::string dataFolder,
allowList.addTrustedSubnet("0.0.0.0/2"sv);
allowList.addTrustedSubnet("abcd::/16"sv);
state bool allowDefaultTenant = testConfig.allowDefaultTenant;
state bool allowDisablingTenants = testConfig.allowDisablingTenants;
state bool allowCreatingTenants = testConfig.allowCreatingTenants;
state bool tenantModeRequired = testConfig.tenantModeRequired;
if (!SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
testConfig.storageEngineExcludeTypes.push_back(5);
@ -2465,12 +2518,6 @@ ACTOR void setupAndRun(std::string dataFolder,
if (std::string_view(testFile).find("restarting") != std::string_view::npos) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
// Disable the default tenant in restarting tests for now
// TODO: persist the chosen default tenant in the restartInfo.ini file for the second test
allowDefaultTenant = false;
allowCreatingTenants = false;
tenantModeRequired = false;
}
// TODO: Currently backup and restore related simulation tests are failing when run with rocksDB storage engine
@ -2520,31 +2567,28 @@ ACTOR void setupAndRun(std::string dataFolder,
state Optional<TenantName> defaultTenant;
state Standalone<VectorRef<TenantNameRef>> tenantsToCreate;
state TenantMode tenantMode = TenantMode::DISABLED;
if (tenantModeRequired || (allowDefaultTenant && deterministicRandom()->random01() < 0.5)) {
defaultTenant = "SimulatedDefaultTenant"_sr;
tenantsToCreate.push_back_deep(tenantsToCreate.arena(), defaultTenant.get());
if (tenantModeRequired || deterministicRandom()->random01() < 0.9) {
tenantMode = TenantMode::REQUIRED;
} else {
// If this is a restarting test, restartInfo.ini is read in restartSimulatedSystem
// where we update the defaultTenant and tenantMode in the testConfig
// Defer setting tenant mode and default tenant until later
if (!rebooting) {
if (testConfig.tenantModes.size()) {
auto randomPick = deterministicRandom()->randomChoice(testConfig.tenantModes);
tenantMode = TenantMode::fromString(randomPick);
if (tenantMode == TenantMode::REQUIRED && allowDefaultTenant) {
defaultTenant = "SimulatedDefaultTenant"_sr;
}
} else if (allowDefaultTenant && deterministicRandom()->coinflip()) {
defaultTenant = "SimulatedDefaultTenant"_sr;
if (deterministicRandom()->random01() < 0.9) {
tenantMode = TenantMode::REQUIRED;
} else {
tenantMode = TenantMode::OPTIONAL_TENANT;
}
} else if (deterministicRandom()->coinflip()) {
tenantMode = TenantMode::OPTIONAL_TENANT;
}
} else if (!allowDisablingTenants || deterministicRandom()->random01() < 0.5) {
tenantMode = TenantMode::OPTIONAL_TENANT;
}
if (allowCreatingTenants && tenantMode != TenantMode::DISABLED && deterministicRandom()->random01() < 0.5) {
int numTenants = deterministicRandom()->randomInt(1, 6);
for (int i = 0; i < numTenants; ++i) {
tenantsToCreate.push_back_deep(tenantsToCreate.arena(),
TenantNameRef(format("SimulatedExtraTenant%04d", i)));
}
}
TraceEvent("SimulatedClusterTenantMode")
.detail("UsingTenant", defaultTenant)
.detail("TenantRequired", tenantMode.toString())
.detail("TotalTenants", tenantsToCreate.size());
try {
// systemActors.push_back( startSystemMonitor(dataFolder) );
if (rebooting) {
@ -2553,7 +2597,7 @@ ACTOR void setupAndRun(std::string dataFolder,
&testerCount,
&connectionString,
&startingConfiguration,
testConfig,
&testConfig,
whitelistBinPaths,
protocolVersion),
100.0));
@ -2574,6 +2618,31 @@ ACTOR void setupAndRun(std::string dataFolder,
tenantMode);
wait(delay(1.0)); // FIXME: WHY!!! Wait for machines to boot
}
// restartSimulatedSystem can adjust some testConfig params related to tenants
// so set/overwrite those options if necessary here
if (rebooting && testConfig.tenantModes.size()) {
tenantMode = TenantMode::fromString(testConfig.tenantModes[0]);
}
if (testConfig.defaultTenant.present() && tenantMode != TenantMode::DISABLED && allowDefaultTenant) {
// Default tenant set by testConfig or restarting data in restartInfo.ini
defaultTenant = testConfig.defaultTenant.get();
}
if (!rebooting) {
if (defaultTenant.present() && allowDefaultTenant) {
tenantsToCreate.push_back_deep(tenantsToCreate.arena(), defaultTenant.get());
}
if (allowCreatingTenants && tenantMode != TenantMode::DISABLED && deterministicRandom()->coinflip()) {
int numTenants = deterministicRandom()->randomInt(1, 6);
for (int i = 0; i < numTenants; ++i) {
tenantsToCreate.push_back_deep(tenantsToCreate.arena(),
TenantNameRef(format("SimulatedExtraTenant%04d", i)));
}
}
}
TraceEvent("SimulatedClusterTenantMode")
.detail("UsingTenant", defaultTenant)
.detail("TenantMode", tenantMode.toString())
.detail("TotalTenants", tenantsToCreate.size());
std::string clusterFileDir = joinPath(dataFolder, deterministicRandom()->randomUniqueID().toString());
platform::createDirectory(clusterFileDir);
writeFile(joinPath(clusterFileDir, "fdb.cluster"), connectionString.get().toString());

@ -122,19 +122,20 @@ public:
ACTOR static Future<Void> monitorStorageUsage(TenantCache* tenantCache) {
TraceEvent(SevInfo, "StartingTenantCacheStorageUsageMonitor", tenantCache->id()).log();
state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_REFRESH_INTERVAL;
state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL;
state double lastTenantListFetchTime = now();
loop {
state double fetchStartTime = now();
state std::vector<std::pair<KeyRef, TenantName>> tenantList = tenantCache->getTenantList();
state std::vector<TenantName> tenants = tenantCache->getTenantList();
state int i;
for (i = 0; i < tenantList.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenantList[i].second);
for (i = 0; i < tenants.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]);
loop {
try {
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
tenantCache->updateStorageUsage(tenantList[i].first, size);
tenantCache->tenantStorageMap[tenants[i]].usage = size;
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
wait(tr.onError(e));
@ -149,6 +150,31 @@ public:
wait(delay(refreshInterval));
}
}
ACTOR static Future<Void> monitorStorageQuota(TenantCache* tenantCache) {
TraceEvent(SevInfo, "StartingTenantCacheStorageQuotaMonitor", tenantCache->id()).log();
state Transaction tr(tenantCache->dbcx());
loop {
loop {
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
for (auto const kv : currentQuotas) {
TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix);
int64_t const quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
tenantCache->tenantStorageMap[tenant].quota = quota;
}
tr.reset();
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
}
}
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
}
}
};
void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) {
@ -203,21 +229,14 @@ int TenantCache::cleanup() {
return tenantsRemoved;
}
std::vector<std::pair<KeyRef, TenantName>> TenantCache::getTenantList() const {
std::vector<std::pair<KeyRef, TenantName>> tenants;
std::vector<TenantName> TenantCache::getTenantList() const {
std::vector<TenantName> tenants;
for (const auto& [prefix, entry] : tenantCache) {
tenants.push_back({ prefix, entry->name() });
tenants.push_back(entry->name());
}
return tenants;
}
void TenantCache::updateStorageUsage(KeyRef prefix, int64_t size) {
auto it = tenantCache.find(prefix);
if (it != tenantCache.end()) {
it->value->updateStorageUsage(size);
}
}
std::string TenantCache::desc() const {
std::string s("@Generation: ");
s += std::to_string(generation) + " ";
@ -264,6 +283,16 @@ Optional<Reference<TCTenantInfo>> TenantCache::tenantOwning(KeyRef key) const {
return it->value;
}
std::vector<TenantName> TenantCache::getTenantsOverQuota() const {
std::vector<TenantName> tenants;
for (const auto& [tenant, storage] : tenantStorageMap) {
if (storage.usage > storage.quota) {
tenants.push_back(tenant);
}
}
return tenants;
}
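// Usage sketch (illustrative, not part of this change): a monitor could act on the over-quota list, e.g.
//   for (const auto& tenant : tenantCache->getTenantsOverQuota()) {
//       TraceEvent(SevWarn, "TenantOverStorageQuota").detail("Tenant", tenant);
//   }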
Future<Void> TenantCache::monitorTenantMap() {
return TenantCacheImpl::monitorTenantMap(this);
}
@ -272,6 +301,10 @@ Future<Void> TenantCache::monitorStorageUsage() {
return TenantCacheImpl::monitorStorageUsage(this);
}
Future<Void> TenantCache::monitorStorageQuota() {
return TenantCacheImpl::monitorStorageQuota(this);
}
class TenantCacheUnitTest {
public:
ACTOR static Future<Void> InsertAndTestPresence() {

@ -1620,9 +1620,17 @@ ACTOR Future<Void> redwoodMetricsLogger() {
}
// Holds an index of recently used objects.
// ObjectType must have the methods
// bool evictable() const; // return true if the entry can be evicted
// Future<Void> onEvictable() const; // ready when entry can be evicted
// ObjectType must have these methods
//
// // Returns true iff the entry can be evicted
// bool evictable() const;
//
// // Ready when object is safe to evict from cache
// Future<Void> onEvictable() const;
//
// // Ready when object destruction is safe
// // Should cancel pending async operations that are safe to cancel when cache is being destroyed
// Future<Void> cancel() const;
template <class IndexType, class ObjectType>
class ObjectCache : NonCopyable {
struct Entry;
@ -1845,7 +1853,7 @@ public:
}
// Clears the cache: claims ownership of all entries from the evictor, then waits for each item to become
// evictable (or cancels it) and evicts it.
ACTOR static Future<Void> clear_impl(ObjectCache* self) {
ACTOR static Future<Void> clear_impl(ObjectCache* self, bool waitForSafeEviction) {
// Claim ownership of all of our cached items, removing them from the evictor's control and quota.
for (auto& ie : self->cache) {
self->pEvictor->reclaim(ie.second);
@ -1857,16 +1865,15 @@ public:
state typename CacheT::iterator i = self->cache.begin();
while (i != self->cache.end()) {
if (!i->second.item.evictable()) {
wait(i->second.item.onEvictable());
}
wait(waitForSafeEviction ? i->second.item.onEvictable() : i->second.item.cancel());
++i;
}
self->cache.clear();
return Void();
}
Future<Void> clear() { return clear_impl(this); }
Future<Void> clear(bool waitForSafeEviction = false) { return clear_impl(this, waitForSafeEviction); }
// Move the prioritized evictions queued to the front of the eviction order
void flushPrioritizedEvictions() { pEvictor->moveIn(prioritizedEvictions); }
@ -1927,6 +1934,13 @@ public:
// Entry is evictable when its write and read futures are ready, even if they are
// errors, so any buffers they hold are no longer needed by the underlying file actors
Future<Void> onEvictable() const { return ready(readFuture) && ready(writeFuture); }
// Read and write futures are safe to cancel so just cancel them and return
Future<Void> cancel() {
writeFuture.cancel();
readFuture.cancel();
return Void();
}
};
typedef ObjectCache<LogicalPageID, PageCacheEntry> PageCacheT;
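PageCacheEntry above is the in-tree implementation of the three-method contract; as a minimal illustration, any object type along these lines (assuming Flow's Future and ready() helpers) can be cached:

// A trivial ObjectType satisfying the ObjectCache contract (illustrative only).
struct SketchCacheEntry {
	Future<Void> pendingIO; // outstanding async work tied to this entry

	// Evictable once the pending work has finished, successfully or not.
	bool evictable() const { return pendingIO.isReady(); }

	// Ready when the entry can be evicted; ready() discards errors.
	Future<Void> onEvictable() const { return ready(pendingIO); }

	// Nothing outside this entry depends on the pending work, so it is safe to cancel.
	Future<Void> cancel() {
		pendingIO.cancel();
		return Void();
	}
};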
@ -2475,14 +2489,15 @@ public:
Future<LogicalPageID> newExtentPageID(QueueID queueID) override { return newExtentPageID_impl(this, queueID); }
ACTOR static Future<Void> writePhysicalBlock(DWALPager* self,
Reference<ArenaPage> page,
int blockNum,
int blockSize,
PhysicalPageID pageID,
PagerEventReasons reason,
unsigned int level,
bool header) {
// Write one block of a physical page in the page file. Returned futures must be allowed to complete.
ACTOR static UNCANCELLABLE Future<Void> writePhysicalBlock(DWALPager* self,
Reference<ArenaPage> page,
int blockNum,
int blockSize,
PhysicalPageID pageID,
PagerEventReasons reason,
unsigned int level,
bool header) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ? ioMaxPriority : ioMinPriority));
++g_redwoodMetrics.metric.pagerDiskWrite;
@ -2506,7 +2521,11 @@ public:
// Note: Not using forwardError here so a write error won't be discovered until commit time.
debug_printf("DWALPager(%s) op=writeBlock %s\n", self->filename.c_str(), toString(pageID).c_str());
wait(self->pageFile->write(page->rawData() + (blockNum * blockSize), blockSize, (int64_t)pageID * blockSize));
debug_printf("DWALPager(%s) op=writeBlockDone %s\n", self->filename.c_str(), toString(pageID).c_str());
// The next line could crash on shutdown: this actor cannot be cancelled, so self could be destroyed after the
// write completes. Enable it with caution when debugging.
// debug_printf("DWALPager(%s) op=writeBlockDone %s\n", self->filename.c_str(), toString(pageID).c_str());
return Void();
}
@ -2530,6 +2549,7 @@ public:
return Void();
}
// All returned futures are added to the operations vector
Future<Void> writePhysicalPage(PagerEventReasons reason,
unsigned int level,
Standalone<VectorRef<PhysicalPageID>> pageIDs,
@ -2753,18 +2773,19 @@ public:
}
void freeExtent(LogicalPageID pageID) override { freeExtent_impl(this, pageID); }
ACTOR static Future<int> readPhysicalBlock(DWALPager* self,
uint8_t* data,
int blockSize,
int64_t offset,
int priority) {
ACTOR static UNCANCELLABLE Future<int> readPhysicalBlock(DWALPager* self,
Reference<ArenaPage> pageBuffer,
int pageOffset,
int blockSize,
int64_t offset,
int priority) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority)));
++g_redwoodMetrics.metric.pagerDiskRead;
int bytes = wait(self->pageFile->read(data, blockSize, offset));
int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset));
return bytes;
}
// Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock
// Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock.
// If the user-chosen physical page size is larger, then there will be a gap of unused space after the header pages
// and before the user-chosen sized pages.
ACTOR static Future<Reference<ArenaPage>> readPhysicalPage(DWALPager* self,
@ -2781,8 +2802,8 @@ public:
page->rawData(),
header);
int readBytes = wait(
readPhysicalBlock(self, page->rawData(), page->rawSize(), (int64_t)pageID * page->rawSize(), priority));
int readBytes =
wait(readPhysicalBlock(self, page, 0, page->rawSize(), (int64_t)pageID * page->rawSize(), priority));
debug_printf("DWALPager(%s) op=readPhysicalDiskReadComplete %s ptr=%p bytes=%d\n",
self->filename.c_str(),
toString(pageID).c_str(),
@ -2845,8 +2866,8 @@ public:
state int blockSize = self->physicalPageSize;
std::vector<Future<int>> reads;
for (int i = 0; i < pageIDs.size(); ++i) {
reads.push_back(readPhysicalBlock(
self, page->rawData() + (i * blockSize), blockSize, ((int64_t)pageIDs[i]) * blockSize, priority));
reads.push_back(
readPhysicalBlock(self, page, i * blockSize, blockSize, ((int64_t)pageIDs[i]) * blockSize, priority));
}
// wait for all the parallel read futures
wait(waitForAll(reads));
@ -3083,8 +3104,8 @@ public:
currentOffset = i * physicalReadSize;
debug_printf("DWALPager(%s) current offset %" PRId64 "\n", self->filename.c_str(), currentOffset);
++g_redwoodMetrics.metric.pagerDiskRead;
reads.push_back(
self->pageFile->read(extent->rawData() + currentOffset, physicalReadSize, startOffset + currentOffset));
reads.push_back(self->readPhysicalBlock(
self, extent, currentOffset, physicalReadSize, startOffset + currentOffset, ioMaxPriority));
}
// Handle the last read separately as it may be smaller than physicalReadSize
@ -3096,8 +3117,8 @@ public:
currentOffset,
lastReadSize);
++g_redwoodMetrics.metric.pagerDiskRead;
reads.push_back(
self->pageFile->read(extent->rawData() + currentOffset, lastReadSize, startOffset + currentOffset));
reads.push_back(self->readPhysicalBlock(
self, extent, currentOffset, lastReadSize, startOffset + currentOffset, ioMaxPriority));
}
// wait for all the parallel read futures for the given extent
@ -3562,30 +3583,36 @@ public:
Value getCommitRecord() const override { return lastCommittedHeader.userCommitRecord; }
ACTOR void shutdown(DWALPager* self, bool dispose) {
// Send to the error promise first and then delay(0) to give users a chance to cancel
// any outstanding operations
if (self->errorPromise.canBeSet()) {
debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str());
self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress
}
wait(delay(0));
// The next section explicitly cancels all pending operations held in the pager
debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str());
self->ioLock.kill();
debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str());
self->recoverFuture.cancel();
debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str());
self->commitFuture.cancel();
debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str());
self->remapCleanupFuture.cancel();
debug_printf("DWALPager(%s) shutdown kill file extension\n", self->filename.c_str());
self->fileExtension.cancel();
if (self->errorPromise.canBeSet()) {
debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str());
self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress
debug_printf("DWALPager(%s) shutdown cancel operations\n", self->filename.c_str());
for (auto& f : self->operations) {
f.cancel();
}
// Must wait for pending operations to complete; canceling them can cause a crash because the underlying
// operations may be uncancellable and depend on memory from the calling scope's page reference
debug_printf("DWALPager(%s) shutdown wait for operations\n", self->filename.c_str());
// Pending ops must all be ready; errors are okay
wait(waitForAllReady(self->operations));
self->operations.clear();
debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str());
wait(self->extentCache.clear());
wait(self->pageCache.clear());
wait(delay(0));
debug_printf("DWALPager(%s) shutdown remappedPagesMap: %s\n",
self->filename.c_str(),
@ -3810,7 +3837,11 @@ private:
Promise<Void> closedPromise;
Promise<Void> errorPromise;
Future<Void> commitFuture;
// The operations vector is used to hold all disk writes made by the Pager, but could also hold
// other operations that need to be waited on before a commit can finish.
std::vector<Future<Void>> operations;
Future<Void> recoverFuture;
Future<Void> remapCleanupFuture;
bool remapCleanupStop;
@ -4582,7 +4613,7 @@ struct BoundaryRefAndPage {
// DecodeBoundaryVerifier provides simulation-only verification of DeltaTree boundaries between
// reads and writes by using a static structure to track boundaries used during DeltaTree generation
// for all writes and updates across cold starts and virtual process restarts.
struct DecodeBoundaryVerifier {
class DecodeBoundaryVerifier {
struct DecodeBoundaries {
Key lower;
Key upper;
@ -4593,11 +4624,13 @@ struct DecodeBoundaryVerifier {
typedef std::map<Version, DecodeBoundaries> BoundariesByVersion;
std::unordered_map<LogicalPageID, BoundariesByVersion> boundariesByPageID;
std::vector<Key> boundarySamples;
int boundarySampleSize = 1000;
int boundaryPopulation = 0;
Reference<IPageEncryptionKeyProvider> keyProvider;
public:
std::vector<Key> boundarySamples;
// Sample rate of pages to be scanned to verify that all entries in the page meet the domain prefix requirement.
double domainPrefixScanProbability = 0.01;
uint64_t domainPrefixScanCount = 0;
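The `boundarySampleSize` / `boundaryPopulation` pair alongside `boundarySamples` suggests a fixed-size, reservoir-style sampler backing `getSample()`. A minimal sketch of that technique under that assumption (an illustration, not the actual Redwood code):

```cpp
#include <cstdint>
#include <random>
#include <string>
#include <vector>

struct BoundarySampler {
    std::vector<std::string> samples; // like boundarySamples
    size_t sampleSize = 1000;         // like boundarySampleSize
    uint64_t population = 0;          // like boundaryPopulation
    std::mt19937_64 rng{ std::random_device{}() };

    void add(const std::string& boundary) {
        ++population;
        if (samples.size() < sampleSize) {
            samples.push_back(boundary);
        } else {
            // Keep every boundary seen so far equally likely to remain sampled:
            // replace a random slot with probability sampleSize / population.
            uint64_t j = std::uniform_int_distribution<uint64_t>(0, population - 1)(rng);
            if (j < sampleSize)
                samples[j] = boundary;
        }
    }

    // Mirrors getSample() above: empty key if nothing sampled, else a random choice.
    std::string getSample() {
        if (samples.empty())
            return {};
        return samples[std::uniform_int_distribution<size_t>(0, samples.size() - 1)(rng)];
    }
};

int main() {
    BoundarySampler s;
    for (int i = 0; i < 5000; ++i)
        s.add("boundary-" + std::to_string(i));
    return s.getSample().empty() ? 1 : 0;
}
```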
@ -4626,7 +4659,7 @@ struct DecodeBoundaryVerifier {
if (boundarySamples.empty()) {
return Key();
}
return boundarySamples[deterministicRandom()->randomInt(0, boundarySamples.size())];
return deterministicRandom()->randomChoice(boundarySamples);
}
bool update(BTreeNodeLinkRef id,
@ -5192,6 +5225,15 @@ public:
Future<Void> init() { return m_init; }
virtual ~VersionedBTree() {
// DecodeBoundaryVerifier objects outlive simulated processes.
// Thus, if we did not clear the key providers here, each DecodeBoundaryVerifier object might
// maintain references to untracked peers through its key provider. This would result in
// errors when FlowTransport::removePeerReference is called to remove a peer that is no
// longer tracked by FlowTransport::transport().
if (m_pBoundaryVerifier != nullptr) {
m_pBoundaryVerifier->setKeyProvider(Reference<IPageEncryptionKeyProvider>());
}
// This probably shouldn't be called directly (meaning deleting an instance directly), but it should be
// safe: it will cancel init and commit and leave the pager alive, though with a potentially incomplete
// set of uncommitted writes, so the pager should not be committed.
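The destructor comments above describe releasing a reference held by a long-lived, simulation-static object so that process-scoped state can actually be freed. A standalone sketch of that teardown pattern using `std::shared_ptr` (all names hypothetical):

```cpp
#include <iostream>
#include <memory>

struct KeyProvider {
    ~KeyProvider() { std::cout << "key provider released\n"; }
};

// Stands in for DecodeBoundaryVerifier: static, so it outlives simulated processes.
struct BoundaryVerifier {
    std::shared_ptr<KeyProvider> keyProvider;
    void setKeyProvider(std::shared_ptr<KeyProvider> kp) { keyProvider = std::move(kp); }
};

struct BTree {
    BoundaryVerifier* verifier = nullptr;
    explicit BTree(BoundaryVerifier* v) : verifier(v) {}
    ~BTree() {
        // Analogous to m_pBoundaryVerifier->setKeyProvider(Reference<...>()):
        // hand the long-lived verifier an empty reference so the provider
        // (and anything it pins) can be destroyed along with this instance.
        if (verifier)
            verifier->setKeyProvider(nullptr);
    }
};

int main() {
    static BoundaryVerifier verifier;
    {
        BTree tree(&verifier);
        verifier.setKeyProvider(std::make_shared<KeyProvider>());
    } // tree destroyed here; "key provider released" prints despite the static verifier
}
```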
@ -8003,7 +8045,9 @@ public:
Future<Void> getError() const override { return delayed(m_error.getFuture()); };
void clear(KeyRangeRef range, const Arena* arena = 0) override {
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = 0) override {
debug_printf("CLEAR %s\n", printable(range).c_str());
m_tree->clear(range);
}

View File

@ -140,9 +140,27 @@ private:
Future<Void> collection;
};
// Defines granule info of interest to a full restore
struct BlobGranuleRestoreVersion {
// The default and arena-copy constructors below are required by VectorRef
BlobGranuleRestoreVersion() {}
BlobGranuleRestoreVersion(Arena& a, const BlobGranuleRestoreVersion& copyFrom)
: granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version),
sizeInBytes(copyFrom.sizeInBytes) {}
UID granuleID;
KeyRangeRef keyRange;
Version version;
int64_t sizeInBytes;
};
// Defines a vector of BlobGranuleRestoreVersion
typedef Standalone<VectorRef<BlobGranuleRestoreVersion>> BlobGranuleRestoreVersionVector;
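The `(Arena&, const T&)` constructor exists so a deep copy can relocate the `KeyRangeRef` member into the destination arena. A toy sketch of why `VectorRef` element types need both constructors, using a simplified arena rather than FoundationDB's real `Arena`/`VectorRef` API:

```cpp
#include <deque>
#include <string>
#include <string_view>

struct Arena { // toy arena: owns copied byte buffers
    std::deque<std::string> storage;
    std::string_view copy(std::string_view s) {
        storage.emplace_back(s);
        return storage.back();
    }
};

struct RestoreVersion {
    RestoreVersion() = default; // required: vector slots are default-constructed
    RestoreVersion(Arena& a, const RestoreVersion& from)
      : keyRange(a.copy(from.keyRange)), version(from.version) {} // deep copy into 'a'
    std::string_view keyRange; // ref type: points into some arena
    long version = 0;
};

int main() {
    Arena a;
    RestoreVersion original;
    {
        std::string shortLived = "begin..end";
        original.keyRange = shortLived;
        RestoreVersion stable(a, original); // keyRange now owned by arena 'a'
        original = stable;
    } // shortLived is gone, but original.keyRange still points into the arena
    return original.keyRange == "begin..end" ? 0 : 1;
}
```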
ACTOR Future<Void> dumpManifest(Database db, Reference<BlobConnectionProvider> blobConn, int64_t epoch, int64_t seqNo);
ACTOR Future<Void> loadManifest(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn);
inline bool isFullRestoreMode() {
return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE;
};

View File

@ -30,23 +30,25 @@
struct BlobMigratorInterface {
constexpr static FileIdentifier file_identifier = 869199;
RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator;
RequestStream<ReplyPromise<Void>> waitFailure;
LocalityData locality;
UID uniqueID;
StorageServerInterface ssi;
BlobMigratorInterface() {}
BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {}
BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {
ssi.locality = l;
ssi.uniqueID = id;
}
void initEndpoints() {}
void initEndpoints() { ssi.initEndpoints(); }
UID id() const { return uniqueID; }
NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); }
NetworkAddress address() const { return haltBlobMigrator.getEndpoint().getPrimaryAddress(); }
bool operator==(const BlobMigratorInterface& r) const { return id() == r.id(); }
bool operator!=(const BlobMigratorInterface& r) const { return !(*this == r); }
template <class Archive>
void serialize(Archive& ar) {
// StorageServerInterface::serialize(ar);
serializer(ar, waitFailure, haltBlobMigrator, locality, uniqueID);
serializer(ar, locality, uniqueID, haltBlobMigrator);
}
};
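The change above embeds a `StorageServerInterface` in the migrator and mirrors the migrator's identity into it, letting the migrator present itself as a storage server. A simplified standalone sketch of that embed-and-forward pattern (stub types, not the real interfaces):

```cpp
#include <iostream>
#include <string>

struct StorageServerInterfaceStub {
    std::string locality;
    unsigned uniqueID = 0;
    bool endpointsReady = false;
    void initEndpoints() { endpointsReady = true; }
};

struct BlobMigratorInterfaceStub {
    std::string locality;
    unsigned uniqueID = 0;
    StorageServerInterfaceStub ssi; // embedded so the migrator can act as an SS

    BlobMigratorInterfaceStub(std::string l, unsigned id)
      : locality(std::move(l)), uniqueID(id) {
        // Mirror identity into the embedded interface, as the diff above does.
        ssi.locality = locality;
        ssi.uniqueID = uniqueID;
    }
    void initEndpoints() { ssi.initEndpoints(); } // forward endpoint setup
};

int main() {
    BlobMigratorInterfaceStub m("dc0", 42);
    m.initEndpoints();
    std::cout << "ssi id=" << m.ssi.uniqueID << " ready=" << m.ssi.endpointsReady << "\n";
}
```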

View File

@ -322,6 +322,9 @@ public:
// Log physicalShard
void logPhysicalShardCollection();
// Checks if a physical shard exists.
bool physicalShardExists(uint64_t physicalShardID);
private:
// Track physicalShard metrics by tracking keyRange metrics
void updatePhysicalShardMetricsByKeyRange(KeyRange keyRange,
@ -481,10 +484,6 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize);
// Determines the maximum shard size based on the size of the database
int64_t getMaxShardSize(double dbSizeEstimate);
struct StorageQuotaInfo {
std::map<Key, uint64_t> quotaMap;
};
#ifndef __INTEL_COMPILER
#pragma endregion
#endif

View File

@ -29,6 +29,7 @@
#include "fdbserver/IClosable.h"
#include "fdbserver/IPageEncryptionKeyProvider.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/StorageMetrics.h"
struct CheckpointRequest {
const Version version; // The FDB version at which the checkpoint is created.
@ -52,7 +53,9 @@ public:
// persistRangeMapping().
virtual bool shardAware() const { return false; }
virtual void set(KeyValueRef keyValue, const Arena* arena = nullptr) = 0;
virtual void clear(KeyRangeRef range, const Arena* arena = nullptr) = 0;
virtual void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) = 0;
virtual Future<Void> canCommit() { return Void(); }
virtual Future<Void> commit(
bool sequential = false) = 0; // returns when prior sets and clears are (atomically) durable
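The widened `clear()` adds an optional `StorageServerMetrics*` defaulting to `nullptr`, so implementations that ignore it (like the Redwood override above) keep working unchanged. A minimal standalone sketch of such an optional-metrics overload; the stub types and the particular use of the pointer are assumptions for illustration, not FDB's real ones:

```cpp
#include <iostream>
#include <map>
#include <string>

struct StorageMetricsStub {
    long long bytesCleared = 0;
};

struct IStoreStub {
    virtual void clear(const std::string& begin,
                       const std::string& end,
                       StorageMetricsStub* metrics = nullptr) = 0;
    virtual ~IStoreStub() = default;
};

struct MemStore : IStoreStub {
    std::map<std::string, std::string> data;
    void clear(const std::string& begin,
               const std::string& end,
               StorageMetricsStub* metrics = nullptr) override {
        auto lo = data.lower_bound(begin), hi = data.lower_bound(end);
        for (auto it = lo; it != hi; ++it)
            if (metrics) // optional: account for what is being dropped
                metrics->bytesCleared += it->first.size() + it->second.size();
        data.erase(lo, hi);
    }
};

int main() {
    MemStore s;
    s.data = { { "a", "1" }, { "b", "2" }, { "c", "3" } };
    StorageMetricsStub m;
    s.clear("a", "c", &m); // clears [a, c)
    std::cout << "remaining=" << s.data.size() << " bytesCleared=" << m.bytesCleared << "\n";
}
```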

View File

@ -390,7 +390,9 @@ struct RemoteIKeyValueStore : public IKeyValueStore {
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override {
interf.set.send(IKVSSetRequest{ keyValue, ReplyPromise<Void>() });
}
void clear(KeyRangeRef range, const Arena* arena = nullptr) override {
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override {
interf.clear.send(IKVSClearRequest{ range, ReplyPromise<Void>() });
}

View File

@ -268,5 +268,4 @@ public:
void removeTeam(TCTeamInfo team);
void updateCacheGeneration(int64_t generation) { m_cacheGeneration = generation; }
int64_t cacheGeneration() const { return m_cacheGeneration; }
void updateStorageUsage(int64_t size) { m_tenantInfo.storageUsage = size; }
};

View File

@ -32,6 +32,12 @@
typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix;
struct Storage {
int64_t quota = std::numeric_limits<int64_t>::max();
int64_t usage = 0;
};
typedef std::unordered_map<TenantName, Storage> TenantStorageMap;
struct TenantCacheTenantCreated {
KeyRange keys;
Promise<bool> reply;
@ -50,6 +56,9 @@ private:
uint64_t generation;
TenantMapByPrefix tenantCache;
// Map from tenant names to storage quota and usage
TenantStorageMap tenantStorageMap;
// mark the start of a new sweep of the tenant cache
void startRefresh();
@ -62,11 +71,8 @@ private:
// return count of tenants that were found to be stale and removed from the cache
int cleanup();
// return the mapping from prefix -> tenant name for all tenants stored in the cache
std::vector<std::pair<KeyRef, TenantName>> getTenantList() const;
// update the size for a tenant; do nothing if the tenant doesn't exist in the map
void updateStorageUsage(KeyRef prefix, int64_t size);
// return the TenantNames of all tenants stored in the cache
std::vector<TenantName> getTenantList() const;
UID id() const { return distributorID; }
@ -85,9 +91,14 @@ public:
Future<Void> monitorStorageUsage();
Future<Void> monitorStorageQuota();
std::string desc() const;
bool isTenantKey(KeyRef key) const;
Optional<Reference<TCTenantInfo>> tenantOwning(KeyRef key) const;
// Get the list of tenants whose current storage usage exceeds their allocated quota
std::vector<TenantName> getTenantsOverQuota() const;
};
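Given the `Storage { quota, usage }` map declared above, `getTenantsOverQuota()` can plausibly be a single scan comparing usage against quota. A minimal sketch under that assumption (not the actual TenantCache implementation):

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <string>
#include <unordered_map>
#include <vector>

struct Storage {
    int64_t quota = std::numeric_limits<int64_t>::max(); // default: effectively unlimited
    int64_t usage = 0;
};
using TenantStorageMap = std::unordered_map<std::string, Storage>; // mirrors the typedef above

std::vector<std::string> getTenantsOverQuota(const TenantStorageMap& m) {
    std::vector<std::string> over;
    for (const auto& [tenant, s] : m)
        if (s.usage > s.quota) // strictly over the allocated quota
            over.push_back(tenant);
    return over;
}

int main() {
    TenantStorageMap m;
    m["t1"] = { 100, 150 };  // over quota
    m["t2"] = { 100, 50 };   // within quota
    m["t3"].usage = 1 << 20; // no quota set -> unlimited by default
    for (const auto& t : getTenantsOverQuota(m))
        std::cout << t << " is over quota\n";
}
```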

Some files were not shown because too many files have changed in this diff.