Merge branch 'main' of github.com:apple/foundationdb into tenant-delete-id

This commit is contained in:
Jon Fu 2022-11-29 16:49:23 -08:00
commit b7cba23126
131 changed files with 3323 additions and 1454 deletions

View File

@ -1,5 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401, E711, C901, W605
max-line-length = 79
ignore = E203, E266, E501, W503, F403, F401, E711, C901, E721, W605
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4,B9

View File

@ -110,6 +110,12 @@ set(FDB_PACKAGE_NAME "${FDB_MAJOR}.${FDB_MINOR}")
configure_file(${CMAKE_SOURCE_DIR}/versions.target.cmake ${CMAKE_CURRENT_BINARY_DIR}/versions.target)
file(WRITE ${CMAKE_BINARY_DIR}/version.txt ${FDB_VERSION})
set(FDB_CURRENT_VERSION ${PROJECT_VERSION})
set(FDB_FUTURE_VERSION "7.4.0")
set(FDB_PREV_RELEASE_VERSION "7.1.25")
set(FDB_PREV2_RELEASE_VERSION "7.0.0")
set(FDB_PREV3_RELEASE_VERSION "6.3.25")
################################################################################
# Flow
################################################################################

View File

@ -154,6 +154,8 @@ class ApiTest(Test):
snapshot_reads = [x + '_SNAPSHOT' for x in reads]
database_reads = [x + '_DATABASE' for x in reads]
database_mutations = [x + '_DATABASE' for x in mutations]
tenant_reads = [x + '_TENANT' for x in reads]
tenant_mutations = [x + '_TENANT' for x in mutations]
mutations += ['VERSIONSTAMP']
versions = ['GET_READ_VERSION', 'SET_READ_VERSION', 'GET_COMMITTED_VERSION']
snapshot_versions = ['GET_READ_VERSION_SNAPSHOT']
@ -183,6 +185,8 @@ class ApiTest(Test):
if not args.no_tenants:
op_choices += tenants
op_choices += tenant_reads
op_choices += tenant_mutations
idempotent_atomic_ops = ['BIT_AND', 'BIT_OR', 'MAX', 'MIN', 'BYTE_MIN', 'BYTE_MAX']
atomic_ops = idempotent_atomic_ops + ['ADD', 'BIT_XOR', 'APPEND_IF_FITS']

View File

@ -283,7 +283,8 @@ if(NOT WIN32)
foreach(test_file ${API_TEST_FILES})
get_filename_component(file_name "${test_file}" NAME_WE)
set(test_name "fdb_c_api_test_${file_name}")
add_test(NAME "${test_name}"
add_scripted_fdb_test(NAME "${test_name}"
TIMEOUT 300
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--api-tester-bin $<TARGET_FILE:fdb_c_api_tester>
@ -291,99 +292,87 @@ if(NOT WIN32)
--test-file ${test_file}
--retain-client-lib-copies
)
set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300)
endforeach()
add_test(NAME fdb_c_upgrade_to_future_version
add_scripted_fdb_test(NAME fdb_c_upgrade_to_future_version
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "7.3.0" "7.4.0" "7.3.0"
--upgrade-path "${FDB_CURRENT_VERSION}" "${FDB_FUTURE_VERSION}" "${FDB_CURRENT_VERSION}"
--process-number 3
)
set_tests_properties("fdb_c_upgrade_to_future_version" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}")
add_test(NAME fdb_c_upgrade_to_future_version_blob_granules
add_scripted_fdb_test(NAME fdb_c_upgrade_to_future_version_blob_granules
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml
--upgrade-path "7.3.0" "7.4.0" "7.3.0"
--upgrade-path "${FDB_CURRENT_VERSION}" "${FDB_FUTURE_VERSION}" "${FDB_CURRENT_VERSION}"
--blob-granules-enabled
--process-number 3
)
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER)
add_test(NAME fdb_c_client_config_tests
add_scripted_fdb_test(NAME fdb_c_client_config_tests
COMMAND $<TARGET_FILE:Python3::Interpreter> ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_client_config_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--client-config-tester-bin $<TARGET_FILE:fdb_c_client_config_tester>
)
add_test(NAME fdb_c_upgrade_single_threaded_630api
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml
--upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.3.0"
--process-number 1
)
add_test(NAME fdb_c_upgrade_single_threaded_700api
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml
--upgrade-path "7.0.0" "7.1.9" "7.3.0"
--process-number 1
)
add_test(NAME fdb_c_upgrade_multi_threaded_630api
add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev3_gradual
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.3.0" "7.1.9"
--upgrade-path "${FDB_PREV3_RELEASE_VERSION}" "${FDB_PREV2_RELEASE_VERSION}" "${FDB_PREV_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}"
--process-number 3
)
add_test(NAME fdb_c_upgrade_multi_threaded_700api
add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev3_direct
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "7.0.0" "7.1.9" "7.3.0" "7.1.9"
--upgrade-path "${FDB_PREV3_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}"
--process-number 3
)
add_test(NAME fdb_c_upgrade_multi_threaded_710api
add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev2_gradual
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "7.1.9" "7.3.0" "7.1.9"
--upgrade-path "${FDB_PREV2_RELEASE_VERSION}" "${FDB_PREV_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}"
--process-number 3
)
add_test(NAME fdb_c_cluster_wiggle
add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev2_direct
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "7.3.0" "wiggle"
--upgrade-path "${FDB_PREV2_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}"
--process-number 3
)
add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "${FDB_PREV_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}"
--process-number 3
)
add_scripted_fdb_test(NAME fdb_c_wiggle_only
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "${FDB_CURRENT_VERSION}" "wiggle"
--disable-log-dump
--process-number 3
--redundancy double
)
add_test(NAME fdb_c_wiggle_and_upgrade_latest
add_scripted_fdb_test(NAME fdb_c_wiggle_and_upgrade
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "7.1.9" "wiggle" "7.3.0"
--disable-log-dump
--process-number 3
--redundancy double
)
add_test(NAME fdb_c_wiggle_and_upgrade_63
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
--build-dir ${CMAKE_BINARY_DIR}
--test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
--upgrade-path "6.3.24" "wiggle" "7.0.0"
--upgrade-path "${FDB_PREV_RELEASE_VERSION}" "wiggle" "${FDB_CURRENT_VERSION}"
--disable-log-dump
--process-number 3
--redundancy double
@ -470,7 +459,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-sanitizer
target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads)
target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include)
add_test(NAME fdb_c_shim_library_tests
add_scripted_fdb_test(NAME fdb_c_shim_library_tests
COMMAND $<TARGET_FILE:Python3::Interpreter> ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--unit-tests-bin $<TARGET_FILE:fdb_c_shim_unit_tests>

View File

@ -1,43 +0,0 @@
[[test]]
title = 'Mixed Workload for Upgrade Tests with a Single FDB Thread'
multiThreaded = false
buggify = true
databasePerTransaction = false
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
runUntilStop = true
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
runUntilStop = true
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
runUntilStop = true
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
runUntilStop = true

View File

@ -7,16 +7,9 @@ import sys
import os
import glob
import unittest
sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "tests", "TestRunner")]
# fmt: off
from binary_download import FdbBinaryDownloader, CURRENT_VERSION
from fdb_version import CURRENT_VERSION, PREV_RELEASE_VERSION, PREV2_RELEASE_VERSION
from binary_download import FdbBinaryDownloader
from local_cluster import LocalCluster, random_secret_string
# fmt: on
PREV_RELEASE_VERSION = "7.1.5"
PREV_PREV_RELEASE_VERSION = "7.0.0"
args = None
downloader = None
@ -180,15 +173,15 @@ class ClientConfigTests(unittest.TestCase):
def test_multiple_external_clients(self):
# Multiple external clients, normal case
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV2_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(PREV_PREV_RELEASE_VERSION)
test.api_version = api_version_from_str(PREV2_RELEASE_VERSION)
test.exec()
def test_no_external_client_support_api_version(self):
# Multiple external clients, API version supported by none of them
test = ClientConfigTest(self)
test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION])
test.create_external_lib_dir([PREV2_RELEASE_VERSION, PREV_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION)
test.expected_error = 2204 # API function missing
@ -197,7 +190,7 @@ class ClientConfigTests(unittest.TestCase):
def test_no_external_client_support_api_version_ignore(self):
# Multiple external clients; API version supported by none of them; Ignore failures
test = ClientConfigTest(self)
test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION])
test.create_external_lib_dir([PREV2_RELEASE_VERSION, PREV_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION)
test.ignore_external_client_failures = True
@ -207,7 +200,7 @@ class ClientConfigTests(unittest.TestCase):
def test_one_external_client_wrong_api_version(self):
# Multiple external clients, API version unsupported by one of them
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV2_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION)
test.expected_error = 2204 # API function missing
@ -216,7 +209,7 @@ class ClientConfigTests(unittest.TestCase):
def test_one_external_client_wrong_api_version_ignore(self):
# Multiple external clients; API version unsupported by one of them; Ignore failures
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV2_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION)
test.ignore_external_client_failures = True
@ -286,6 +279,6 @@ if __name__ == "__main__":
downloader = FdbBinaryDownloader(args.build_dir)
downloader.download_old_binaries(PREV_RELEASE_VERSION)
downloader.download_old_binaries(PREV_PREV_RELEASE_VERSION)
downloader.download_old_binaries(PREV2_RELEASE_VERSION)
unittest.main(verbosity=2)

View File

@ -6,15 +6,10 @@ import shutil
import subprocess
import sys
import os
sys.path[:0] = [os.path.join(os.path.dirname(__file__), '..', '..', '..', 'tests', 'TestRunner')]
# fmt: off
from binary_download import FdbBinaryDownloader, CURRENT_VERSION
from binary_download import FdbBinaryDownloader
from local_cluster import LocalCluster, random_secret_string
# fmt: on
from fdb_version import CURRENT_VERSION, PREV_RELEASE_VERSION
LAST_RELEASE_VERSION = "7.1.5"
TESTER_STATS_INTERVAL_SEC = 5
DEFAULT_TEST_FILE = "CApiCorrectnessMultiThr.toml"
IMPLIBSO_ERROR_CODE = -6 # SIGABORT
@ -54,13 +49,12 @@ class TestEnv(LocalCluster):
self.downloader.binary_path(version, "fdbserver"),
self.downloader.binary_path(version, "fdbmonitor"),
self.downloader.binary_path(version, "fdbcli"),
1
1,
)
self.set_env_var("LD_LIBRARY_PATH", self.downloader.lib_dir(version))
client_lib = self.downloader.lib_path(version)
assert client_lib.exists(), "{} does not exist".format(client_lib)
self.client_lib_external = self.tmp_dir.joinpath(
"libfdb_c_external.so")
self.client_lib_external = self.tmp_dir.joinpath("libfdb_c_external.so")
shutil.copyfile(client_lib, self.client_lib_external)
def __enter__(self):
@ -73,22 +67,16 @@ class TestEnv(LocalCluster):
shutil.rmtree(self.tmp_dir)
def exec_client_command(self, cmd_args, env_vars=None, expected_ret_code=0):
print("Executing test command: {}".format(
" ".join([str(c) for c in cmd_args])
))
tester_proc = subprocess.Popen(
cmd_args, stdout=sys.stdout, stderr=sys.stderr, env=env_vars
)
print("Executing test command: {}".format(" ".join([str(c) for c in cmd_args])))
tester_proc = subprocess.Popen(cmd_args, stdout=sys.stdout, stderr=sys.stderr, env=env_vars)
tester_retcode = tester_proc.wait()
assert tester_retcode == expected_ret_code, "Tester completed return code {}, but {} was expected".format(
tester_retcode, expected_ret_code)
tester_retcode, expected_ret_code
)
class FdbCShimTests:
def __init__(
self,
args
):
def __init__(self, args):
self.build_dir = Path(args.build_dir).resolve()
assert self.build_dir.exists(), "{} does not exist".format(args.build_dir)
assert self.build_dir.is_dir(), "{} is not a directory".format(args.build_dir)
@ -97,15 +85,14 @@ class FdbCShimTests:
self.api_tester_bin = Path(args.api_tester_bin).resolve()
assert self.api_tester_bin.exists(), "{} does not exist".format(self.api_tester_bin)
self.shim_lib_tester_bin = Path(args.shim_lib_tester_bin).resolve()
assert self.shim_lib_tester_bin.exists(
), "{} does not exist".format(self.shim_lib_tester_bin)
assert self.shim_lib_tester_bin.exists(), "{} does not exist".format(self.shim_lib_tester_bin)
self.api_test_dir = Path(args.api_test_dir).resolve()
assert self.api_test_dir.exists(), "{} does not exist".format(self.api_test_dir)
self.downloader = FdbBinaryDownloader(args.build_dir)
# binary downloads are currently available only for x86_64
self.platform = platform.machine()
if (self.platform == "x86_64"):
self.downloader.download_old_binaries(LAST_RELEASE_VERSION)
if self.platform == "x86_64":
self.downloader.download_old_binaries(PREV_RELEASE_VERSION)
self.downloader.download_old_binaries("7.0.0")
def build_c_api_tester_args(self, test_env, test_file):
@ -127,34 +114,27 @@ class FdbCShimTests:
"--tmp-dir",
test_env.tmp_dir,
"--stats-interval",
str(TESTER_STATS_INTERVAL_SEC * 1000)
str(TESTER_STATS_INTERVAL_SEC * 1000),
]
def run_c_api_test(self, version, test_file):
print('-' * 80)
print("-" * 80)
print("C API Test - version: {}, workload: {}".format(version, test_file))
print('-' * 80)
print("-" * 80)
with TestEnv(self.build_dir, self.downloader, version) as test_env:
cmd_args = self.build_c_api_tester_args(test_env, test_file)
env_vars = os.environ.copy()
env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path(
version)
env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path(version)
test_env.exec_client_command(cmd_args, env_vars)
def run_c_unit_tests(self, version):
print('-' * 80)
print("-" * 80)
print("C Unit Tests - version: {}".format(version))
print('-' * 80)
print("-" * 80)
with TestEnv(self.build_dir, self.downloader, version) as test_env:
cmd_args = [
self.unit_tests_bin,
test_env.cluster_file,
"fdb",
test_env.client_lib_external
]
cmd_args = [self.unit_tests_bin, test_env.cluster_file, "fdb", test_env.client_lib_external]
env_vars = os.environ.copy()
env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path(
version)
env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path(version)
test_env.exec_client_command(cmd_args, env_vars)
def run_c_shim_lib_tester(
@ -167,9 +147,9 @@ class FdbCShimTests:
set_env_path=False,
set_ld_lib_path=False,
use_external_lib=True,
expected_ret_code=0
expected_ret_code=0,
):
print('-' * 80)
print("-" * 80)
if api_version is None:
api_version = api_version_from_str(version)
test_flags = []
@ -183,9 +163,8 @@ class FdbCShimTests:
test_flags.append("use_external_lib")
else:
test_flags.append("use_local_lib")
print("C Shim Tests - version: {}, API version: {}, {}".format(version,
api_version, ", ".join(test_flags)))
print('-' * 80)
print("C Shim Tests - version: {}, API version: {}, {}".format(version, api_version, ", ".join(test_flags)))
print("-" * 80)
cmd_args = [
self.shim_lib_tester_bin,
"--cluster-file",
@ -196,20 +175,16 @@ class FdbCShimTests:
if call_set_path:
cmd_args = cmd_args + [
"--local-client-library",
("dummy" if invalid_lib_path else self.downloader.lib_path(version))
("dummy" if invalid_lib_path else self.downloader.lib_path(version)),
]
if use_external_lib:
cmd_args = cmd_args + [
"--disable-local-client",
"--external-client-library",
test_env.client_lib_external
]
cmd_args = cmd_args + ["--disable-local-client", "--external-client-library", test_env.client_lib_external]
env_vars = os.environ.copy()
env_vars["LD_LIBRARY_PATH"] = (
self.downloader.lib_dir(version) if set_ld_lib_path else "")
env_vars["LD_LIBRARY_PATH"] = self.downloader.lib_dir(version) if set_ld_lib_path else ""
if set_env_path:
env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = (
"dummy" if invalid_lib_path else self.downloader.lib_path(version))
"dummy" if invalid_lib_path else self.downloader.lib_path(version)
)
test_env.exec_client_command(cmd_args, env_vars, expected_ret_code)
def run_tests(self):
@ -221,50 +196,60 @@ class FdbCShimTests:
with TestEnv(self.build_dir, self.downloader, CURRENT_VERSION) as test_env:
# Test lookup of the client library over LD_LIBRARY_PATH
self.run_c_shim_lib_tester(
CURRENT_VERSION, test_env, set_ld_lib_path=True)
self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, set_ld_lib_path=True)
# Test setting the client library path over an API call
self.run_c_shim_lib_tester(
CURRENT_VERSION, test_env, call_set_path=True)
self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, call_set_path=True)
# Test setting the client library path over an environment variable
self.run_c_shim_lib_tester(
CURRENT_VERSION, test_env, set_env_path=True)
self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, set_env_path=True)
# Test using the loaded client library as the local client
self.run_c_shim_lib_tester(
CURRENT_VERSION, test_env, call_set_path=True, use_external_lib=False)
self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, call_set_path=True, use_external_lib=False)
# Test setting an invalid client library path over an API call
self.run_c_shim_lib_tester(
CURRENT_VERSION, test_env, call_set_path=True, invalid_lib_path=True, expected_ret_code=IMPLIBSO_ERROR_CODE)
CURRENT_VERSION,
test_env,
call_set_path=True,
invalid_lib_path=True,
expected_ret_code=IMPLIBSO_ERROR_CODE,
)
# Test setting an invalid client library path over an environment variable
self.run_c_shim_lib_tester(
CURRENT_VERSION, test_env, set_env_path=True, invalid_lib_path=True, expected_ret_code=IMPLIBSO_ERROR_CODE)
CURRENT_VERSION,
test_env,
set_env_path=True,
invalid_lib_path=True,
expected_ret_code=IMPLIBSO_ERROR_CODE,
)
# Test calling a function that exists in the loaded library, but not for the selected API version
self.run_c_shim_lib_tester(
CURRENT_VERSION, test_env, call_set_path=True, api_version=700)
self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, call_set_path=True, api_version=700)
# binary downloads are currently available only for x86_64
if self.platform == "x86_64":
# Test the API workload with the release version
self.run_c_api_test(LAST_RELEASE_VERSION, DEFAULT_TEST_FILE)
self.run_c_api_test(PREV_RELEASE_VERSION, DEFAULT_TEST_FILE)
with TestEnv(self.build_dir, self.downloader, LAST_RELEASE_VERSION) as test_env:
with TestEnv(self.build_dir, self.downloader, PREV_RELEASE_VERSION) as test_env:
# Test using the loaded client library as the local client
self.run_c_shim_lib_tester(
LAST_RELEASE_VERSION, test_env, call_set_path=True, use_external_lib=False)
self.run_c_shim_lib_tester(PREV_RELEASE_VERSION, test_env, call_set_path=True, use_external_lib=False)
# Test the client library of the release version in combination with the dev API version
self.run_c_shim_lib_tester(
LAST_RELEASE_VERSION, test_env, call_set_path=True, api_version=api_version_from_str(CURRENT_VERSION), expected_ret_code=1)
PREV_RELEASE_VERSION,
test_env,
call_set_path=True,
api_version=api_version_from_str(CURRENT_VERSION),
expected_ret_code=1,
)
# Test calling a function that does not exist in the loaded library
self.run_c_shim_lib_tester(
"7.0.0", test_env, call_set_path=True, api_version=700, expected_ret_code=IMPLIBSO_ERROR_CODE)
"7.0.0", test_env, call_set_path=True, api_version=700, expected_ret_code=IMPLIBSO_ERROR_CODE
)
if __name__ == "__main__":
@ -285,25 +270,17 @@ if __name__ == "__main__":
required=True,
)
parser.add_argument(
'--unit-tests-bin',
type=str,
help='Path to the fdb_c_shim_unit_tests executable.',
required=True)
"--unit-tests-bin", type=str, help="Path to the fdb_c_shim_unit_tests executable.", required=True
)
parser.add_argument(
'--api-tester-bin',
type=str,
help='Path to the fdb_c_shim_api_tester executable.',
required=True)
"--api-tester-bin", type=str, help="Path to the fdb_c_shim_api_tester executable.", required=True
)
parser.add_argument(
'--shim-lib-tester-bin',
type=str,
help='Path to the fdb_c_shim_lib_tester executable.',
required=True)
"--shim-lib-tester-bin", type=str, help="Path to the fdb_c_shim_lib_tester executable.", required=True
)
parser.add_argument(
'--api-test-dir',
type=str,
help='Path to a directory with api test definitions.',
required=True)
"--api-test-dir", type=str, help="Path to a directory with api test definitions.", required=True
)
args = parser.parse_args()
test = FdbCShimTests(args)
test.run_tests()

View File

@ -42,6 +42,8 @@ import (
// usually created and committed automatically by the (Database).Transact
// method.
type Database struct {
// String reference to the cluster file.
clusterFile string
*database
}
@ -56,6 +58,16 @@ type DatabaseOptions struct {
d *database
}
// Close will close the Database and clean up all resources.
// You must ensure that you are not reusing this database.
func (d *Database) Close() {
// Remove database object from the cached databases
delete(openDatabases, d.clusterFile)
// Destroy the database
d.destroy()
}
func (opt DatabaseOptions) setOpt(code int, param []byte) error {
return setOpt(func(p *C.uint8_t, pl C.int) C.fdb_error_t {
return C.fdb_database_set_option(opt.d.ptr, C.FDBDatabaseOption(code), p, pl)
@ -63,6 +75,10 @@ func (opt DatabaseOptions) setOpt(code int, param []byte) error {
}
func (d *database) destroy() {
if d.ptr == nil {
return
}
C.fdb_database_destroy(d.ptr)
}

View File

@ -39,6 +39,7 @@ import (
// Would put this in futures.go but for the documented issue with
// exports and functions in preamble
// (https://code.google.com/p/go-wiki/wiki/cgo#Global_functions)
//
//export unlockMutex
func unlockMutex(p unsafe.Pointer) {
m := (*sync.Mutex)(p)
@ -337,7 +338,7 @@ func createDatabase(clusterFile string) (Database, error) {
db := &database{outdb}
runtime.SetFinalizer(db, (*database).destroy)
return Database{db}, nil
return Database{clusterFile, db}, nil
}
// Deprecated: Use OpenDatabase instead.

View File

@ -48,7 +48,10 @@ func ExampleOpenDefault() {
return
}
_ = db
// Close the database after usage
defer db.Close()
// Do work here
// Output:
}
@ -313,3 +316,30 @@ func ExamplePrintable() {
fmt.Println(fdb.Printable([]byte{0, 1, 2, 'a', 'b', 'c', '1', '2', '3', '!', '?', 255}))
// Output: \x00\x01\x02abc123!?\xff
}
func TestDatabaseCloseRemovesResources(t *testing.T) {
err := fdb.APIVersion(API_VERSION)
if err != nil {
t.Fatalf("Unable to set API version: %v\n", err)
}
// OpenDefault opens the database described by the platform-specific default
// cluster file
db, err := fdb.OpenDefault()
if err != nil {
t.Fatalf("Unable to set API version: %v\n", err)
}
// Close the database after usage
db.Close()
// Open the same database again; if it were still in the cache we would get the same object back, otherwise a new object with a new pointer is created
newDB, err := fdb.OpenDefault()
if err != nil {
t.Fatalf("Unable to open default database: %v\n", err)
}
if db == newDB {
t.Fatalf("Expected a different database object, got: %v and %v\n", db, newDB)
}
}

View File

@ -25,14 +25,14 @@ https://apple.github.io/foundationdb/api-python.html"""
def open(*args, **kwargs):
raise RuntimeError('You must call api_version() before using any fdb methods')
raise RuntimeError("You must call api_version() before using any fdb methods")
init = open
def transactional(*args, **kwargs):
raise RuntimeError('You must call api_version() before using fdb.transactional')
raise RuntimeError("You must call api_version() before using fdb.transactional")
def _add_symbols(module, symbols):
@ -41,29 +41,29 @@ def _add_symbols(module, symbols):
def is_api_version_selected():
return '_version' in globals()
return "_version" in globals()
def get_api_version():
if is_api_version_selected():
return globals()['_version']
return globals()["_version"]
else:
raise RuntimeError('API version is not set')
raise RuntimeError("API version is not set")
def api_version(ver):
header_version = 720
if '_version' in globals():
if globals()['_version'] != ver:
raise RuntimeError('FDB API already loaded at version %d' % _version)
if "_version" in globals():
if globals()["_version"] != ver:
raise RuntimeError("FDB API already loaded at version %d" % _version)
return
if ver < 13:
raise RuntimeError('FDB API versions before 13 are not supported')
raise RuntimeError("FDB API versions before 13 are not supported")
if ver > header_version:
raise RuntimeError('Latest known FDB API version is %d' % header_version)
raise RuntimeError("Latest known FDB API version is %d" % header_version)
import fdb.impl
@ -71,31 +71,37 @@ def api_version(ver):
if err == 2203: # api_version_not_supported, but that's not helpful to the user
max_supported_ver = fdb.impl._capi.fdb_get_max_api_version()
if header_version > max_supported_ver:
raise RuntimeError("This version of the FoundationDB Python binding is not supported by the installed "
"FoundationDB C library. The binding requires a library that supports API version "
"%d, but the installed library supports a maximum version of %d." % (header_version, max_supported_ver))
raise RuntimeError(
"This version of the FoundationDB Python binding is not supported by the installed "
"FoundationDB C library. The binding requires a library that supports API version "
"%d, but the installed library supports a maximum version of %d."
% (header_version, max_supported_ver)
)
else:
raise RuntimeError("API version %d is not supported by the installed FoundationDB C library." % ver)
raise RuntimeError(
"API version %d is not supported by the installed FoundationDB C library."
% ver
)
elif err != 0:
raise RuntimeError('FoundationDB API error')
raise RuntimeError("FoundationDB API error")
fdb.impl.init_c_api()
list = (
'FDBError',
'predicates',
'Future',
'Database',
'Tenant',
'Transaction',
'KeyValue',
'KeySelector',
'open',
'transactional',
'options',
'StreamingMode',
"FDBError",
"predicates",
"Future",
"Database",
"Tenant",
"Transaction",
"KeyValue",
"KeySelector",
"open",
"transactional",
"options",
"StreamingMode",
)
_add_symbols(fdb.impl, list)
@ -134,14 +140,20 @@ def api_version(ver):
if not hasattr(self, "__iterating"):
self.__iterating = iter(self)
return next(self.__iterating)
setattr(fdb.impl.FDBRange, "next", next)
globals()['_version'] = ver
globals()["_version"] = ver
import fdb.directory_impl
directory_symbols = ('directory', 'DirectoryLayer',)
directory_symbols = (
"directory",
"DirectoryLayer",
)
_add_symbols(fdb.directory_impl, directory_symbols)
import fdb.subspace_impl
subspace_symbols = ('Subspace',)
subspace_symbols = ("Subspace",)
_add_symbols(fdb.subspace_impl, subspace_symbols)
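As a brief orientation (not part of the diff), here is a minimal sketch of the entry point this module wires up; the key and value are placeholders. An API version must be selected before any other fdb call, after which `open`, `transactional`, and the other symbols registered above become available.

```python
import fdb

# Select the API version first; the stubs above raise RuntimeError otherwise.
fdb.api_version(720)

# Open the database described by the default cluster file.
db = fdb.open()

@fdb.transactional
def set_and_get(tr):
    tr[b"hello"] = b"world"
    return tr[b"hello"].wait()

print(set_and_get(db))
```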

View File

@ -35,8 +35,7 @@ class AllocatorTransactionState:
self.lock = threading.Lock()
class HighContentionAllocator (object):
class HighContentionAllocator(object):
def __init__(self, subspace):
self.counters = subspace[0]
self.recent = subspace[1]
@ -45,9 +44,9 @@ class HighContentionAllocator (object):
@_impl.transactional
def allocate(self, tr):
"""Returns a byte string that
1) has never and will never be returned by another call to this
method on the same subspace
2) is nearly as short as possible given the above
1) has never and will never be returned by another call to this
method on the same subspace
2) is nearly as short as possible given the above
"""
# Get transaction-local state
@ -59,16 +58,23 @@ class HighContentionAllocator (object):
tr_state = tr.__fdb_directory_layer_hca_state__
while True:
[start] = [self.counters.unpack(k)[0] for k, _ in tr.snapshot.get_range(
self.counters.range().start, self.counters.range().stop, limit=1, reverse=True)] or [0]
[start] = [
self.counters.unpack(k)[0]
for k, _ in tr.snapshot.get_range(
self.counters.range().start,
self.counters.range().stop,
limit=1,
reverse=True,
)
] or [0]
window_advanced = False
while True:
with tr_state.lock:
if window_advanced:
del tr[self.counters: self.counters[start]]
del tr[self.counters : self.counters[start]]
tr.options.set_next_write_no_write_conflict_range()
del tr[self.recent: self.recent[start]]
del tr[self.recent : self.recent[start]]
# Increment the allocation count for the current window
tr.add(self.counters[start], struct.pack("<q", 1))
@ -94,10 +100,15 @@ class HighContentionAllocator (object):
candidate = random.randrange(start, start + window)
with tr_state.lock:
latest_counter = tr.snapshot.get_range(self.counters.range().start, self.counters.range().stop, limit=1, reverse=True)
latest_counter = tr.snapshot.get_range(
self.counters.range().start,
self.counters.range().stop,
limit=1,
reverse=True,
)
candidate_value = tr[self.recent[candidate]]
tr.options.set_next_write_no_write_conflict_range()
tr[self.recent[candidate]] = b''
tr[self.recent[candidate]] = b""
latest_counter = [self.counters.unpack(k)[0] for k, _ in latest_counter]
if len(latest_counter) > 0 and latest_counter[0] > start:
@ -121,7 +132,7 @@ class HighContentionAllocator (object):
class Directory(object):
def __init__(self, directory_layer, path=(), layer=b''):
def __init__(self, directory_layer, path=(), layer=b""):
self._directory_layer = directory_layer
self._path = path
self._layer = layer
@ -129,7 +140,9 @@ class Directory(object):
@_impl.transactional
def create_or_open(self, tr, path, layer=None):
path = self._tuplify_path(path)
return self._directory_layer.create_or_open(tr, self._partition_subpath(path), layer)
return self._directory_layer.create_or_open(
tr, self._partition_subpath(path), layer
)
@_impl.transactional
def open(self, tr, path, layer=None):
@ -139,7 +152,9 @@ class Directory(object):
@_impl.transactional
def create(self, tr, path, layer=None, prefix=None):
path = self._tuplify_path(path)
return self._directory_layer.create(tr, self._partition_subpath(path), layer, prefix)
return self._directory_layer.create(
tr, self._partition_subpath(path), layer, prefix
)
@_impl.transactional
def list(self, tr, path=()):
@ -150,7 +165,9 @@ class Directory(object):
def move(self, tr, old_path, new_path):
old_path = self._tuplify_path(old_path)
new_path = self._tuplify_path(new_path)
return self._directory_layer.move(tr, self._partition_subpath(old_path), self._partition_subpath(new_path))
return self._directory_layer.move(
tr, self._partition_subpath(old_path), self._partition_subpath(new_path)
)
@_impl.transactional
def move_to(self, tr, new_absolute_path):
@ -161,25 +178,33 @@ class Directory(object):
if partition_path != directory_layer._path:
raise ValueError("Cannot move between partitions.")
return directory_layer.move(tr, self._path[partition_len:], new_absolute_path[partition_len:])
return directory_layer.move(
tr, self._path[partition_len:], new_absolute_path[partition_len:]
)
@_impl.transactional
def remove(self, tr, path=()):
path = self._tuplify_path(path)
directory_layer = self._get_layer_for_path(path)
return directory_layer.remove(tr, self._partition_subpath(path, directory_layer))
return directory_layer.remove(
tr, self._partition_subpath(path, directory_layer)
)
@_impl.transactional
def remove_if_exists(self, tr, path=()):
path = self._tuplify_path(path)
directory_layer = self._get_layer_for_path(path)
return directory_layer.remove_if_exists(tr, self._partition_subpath(path, directory_layer))
return directory_layer.remove_if_exists(
tr, self._partition_subpath(path, directory_layer)
)
@_impl.transactional
def exists(self, tr, path=()):
path = self._tuplify_path(path)
directory_layer = self._get_layer_for_path(path)
return directory_layer.exists(tr, self._partition_subpath(path, directory_layer))
return directory_layer.exists(
tr, self._partition_subpath(path, directory_layer)
)
def get_layer(self):
return self._layer
@ -194,7 +219,7 @@ class Directory(object):
def _partition_subpath(self, path, directory_layer=None):
directory_layer = directory_layer or self._directory_layer
return self._path[len(directory_layer._path):] + path
return self._path[len(directory_layer._path) :] + path
# Called by all functions that could operate on this subspace directly (move_to, remove, remove_if_exists, exists)
# Subclasses can choose to return a different directory layer to use for the operation if path is in fact ()
@ -203,8 +228,12 @@ class Directory(object):
class DirectoryLayer(Directory):
def __init__(self, node_subspace=Subspace(rawPrefix=b'\xfe'), content_subspace=Subspace(), allow_manual_prefixes=False):
def __init__(
self,
node_subspace=Subspace(rawPrefix=b"\xfe"),
content_subspace=Subspace(),
allow_manual_prefixes=False,
):
Directory.__init__(self, self)
# If specified, new automatically allocated prefixes will all fall within content_subspace
@ -215,11 +244,11 @@ class DirectoryLayer(Directory):
# The root node is the one whose contents are the node subspace
self._root_node = self._node_subspace[self._node_subspace.key()]
self._allocator = HighContentionAllocator(self._root_node[b'hca'])
self._allocator = HighContentionAllocator(self._root_node[b"hca"])
@_impl.transactional
def create_or_open(self, tr, path, layer=None):
""" Opens the directory with the given path.
"""Opens the directory with the given path.
If the directory does not exist, it is created (creating parent
directories if necessary).
@ -229,12 +258,16 @@ class DirectoryLayer(Directory):
"""
return self._create_or_open_internal(tr, path, layer)
def _create_or_open_internal(self, tr, path, layer=None, prefix=None, allow_create=True, allow_open=True):
def _create_or_open_internal(
self, tr, path, layer=None, prefix=None, allow_create=True, allow_open=True
):
self._check_version(tr, write_access=False)
if prefix is not None and not self._allow_manual_prefixes:
if len(self._path) == 0:
raise ValueError("Cannot specify a prefix unless manual prefixes are enabled.")
raise ValueError(
"Cannot specify a prefix unless manual prefixes are enabled."
)
else:
raise ValueError("Cannot specify a prefix in a partition.")
@ -248,7 +281,9 @@ class DirectoryLayer(Directory):
if existing_node.exists():
if existing_node.is_in_partition():
subpath = existing_node.get_partition_subpath()
return existing_node.get_contents(self)._directory_layer._create_or_open_internal(
return existing_node.get_contents(
self
)._directory_layer._create_or_open_internal(
tr, subpath, layer, prefix, allow_create, allow_open
)
@ -256,7 +291,9 @@ class DirectoryLayer(Directory):
raise ValueError("The directory already exists.")
if layer and existing_node.layer() != layer:
raise ValueError("The directory was created with an incompatible layer.")
raise ValueError(
"The directory was created with an incompatible layer."
)
return existing_node.get_contents(self)
@ -269,16 +306,23 @@ class DirectoryLayer(Directory):
prefix = self._content_subspace.key() + self._allocator.allocate(tr)
if len(list(tr.get_range_startswith(prefix, limit=1))) > 0:
raise Exception("The database has keys stored at the prefix chosen by the automatic prefix allocator: %r." % prefix)
raise Exception(
"The database has keys stored at the prefix chosen by the automatic prefix allocator: %r."
% prefix
)
if not self._is_prefix_free(tr.snapshot, prefix):
raise Exception("The directory layer has manually allocated prefixes that conflict with the automatic prefix allocator.")
raise Exception(
"The directory layer has manually allocated prefixes that conflict with the automatic prefix allocator."
)
elif not self._is_prefix_free(tr, prefix):
raise ValueError("The given prefix is already in use.")
if len(path) > 1:
parent_node = self._node_with_prefix(self.create_or_open(tr, path[:-1]).key())
parent_node = self._node_with_prefix(
self.create_or_open(tr, path[:-1]).key()
)
else:
parent_node = self._root_node
if not parent_node:
@ -288,15 +332,15 @@ class DirectoryLayer(Directory):
node = self._node_with_prefix(prefix)
tr[parent_node[self.SUBDIRS][path[-1]]] = prefix
if not layer:
layer = b''
layer = b""
tr[node[b'layer']] = layer
tr[node[b"layer"]] = layer
return self._contents_of_node(node, path, layer)
@_impl.transactional
def open(self, tr, path, layer=None):
""" Opens the directory with the given path.
"""Opens the directory with the given path.
An error is raised if the directory does not exist, or if a layer is
specified and a different layer was specified when the directory was
@ -321,7 +365,7 @@ class DirectoryLayer(Directory):
@_impl.transactional
def move_to(self, tr, new_absolute_path):
raise Exception('The root directory cannot be moved.')
raise Exception("The root directory cannot be moved.")
@_impl.transactional
def move(self, tr, old_path, new_path):
@ -339,8 +383,10 @@ class DirectoryLayer(Directory):
old_path = _to_unicode_path(old_path)
new_path = _to_unicode_path(new_path)
if old_path == new_path[:len(old_path)]:
raise ValueError("The destination directory cannot be a subdirectory of the source directory.")
if old_path == new_path[: len(old_path)]:
raise ValueError(
"The destination directory cannot be a subdirectory of the source directory."
)
old_node = self._find(tr, old_path).prefetch_metadata(tr)
new_node = self._find(tr, new_path).prefetch_metadata(tr)
@ -349,18 +395,30 @@ class DirectoryLayer(Directory):
raise ValueError("The source directory does not exist.")
if old_node.is_in_partition() or new_node.is_in_partition():
if not old_node.is_in_partition() or not new_node.is_in_partition() or old_node.path != new_node.path:
if (
not old_node.is_in_partition()
or not new_node.is_in_partition()
or old_node.path != new_node.path
):
raise ValueError("Cannot move between partitions.")
return new_node.get_contents(self).move(tr, old_node.get_partition_subpath(), new_node.get_partition_subpath())
return new_node.get_contents(self).move(
tr, old_node.get_partition_subpath(), new_node.get_partition_subpath()
)
if new_node.exists():
raise ValueError("The destination directory already exists. Remove it first.")
raise ValueError(
"The destination directory already exists. Remove it first."
)
parent_node = self._find(tr, new_path[:-1])
if not parent_node.exists():
raise ValueError("The parent of the destination directory does not exist. Create it first.")
tr[parent_node.subspace[self.SUBDIRS][new_path[-1]]] = self._node_subspace.unpack(old_node.subspace.key())[0]
raise ValueError(
"The parent of the destination directory does not exist. Create it first."
)
tr[
parent_node.subspace[self.SUBDIRS][new_path[-1]]
] = self._node_subspace.unpack(old_node.subspace.key())[0]
self._remove_from_parent(tr, old_path)
return self._contents_of_node(old_node.subspace, new_path, old_node.layer())
@ -400,7 +458,9 @@ class DirectoryLayer(Directory):
return False
if node.is_in_partition():
return node.get_contents(self)._directory_layer._remove_internal(tr, node.get_partition_subpath(), fail_on_nonexistent)
return node.get_contents(self)._directory_layer._remove_internal(
tr, node.get_partition_subpath(), fail_on_nonexistent
)
self._remove_recursive(tr, node.subspace)
self._remove_from_parent(tr, path)
@ -447,7 +507,7 @@ class DirectoryLayer(Directory):
VERSION = (1, 0, 0)
def _check_version(self, tr, write_access=True):
version = tr[self._root_node[b'version']]
version = tr[self._root_node[b"version"]]
if not version.present():
if write_access:
@ -455,16 +515,22 @@ class DirectoryLayer(Directory):
return
version = struct.unpack('<III', bytes(version))
version = struct.unpack("<III", bytes(version))
if version[0] > self.VERSION[0]:
raise Exception("Cannot load directory with version %d.%d.%d using directory layer %d.%d.%d" % (version + self.VERSION))
raise Exception(
"Cannot load directory with version %d.%d.%d using directory layer %d.%d.%d"
% (version + self.VERSION)
)
if version[1] > self.VERSION[1] and write_access:
raise Exception("Directory with version %d.%d.%d is read-only when opened using directory layer %d.%d.%d" % (version + self.VERSION))
raise Exception(
"Directory with version %d.%d.%d is read-only when opened using directory layer %d.%d.%d"
% (version + self.VERSION)
)
def _initialize_directory(self, tr):
tr[self._root_node[b'version']] = struct.pack('<III', *self.VERSION)
tr[self._root_node[b"version"]] = struct.pack("<III", *self.VERSION)
def _node_containing_key(self, tr, key):
# Right now this is only used for _is_prefix_free(), but if we add
@ -472,10 +538,12 @@ class DirectoryLayer(Directory):
# path based on a key.
if key.startswith(self._node_subspace.key()):
return self._root_node
for k, v in tr.get_range(self._node_subspace.range(()).start,
self._node_subspace.pack((key,)) + b'\x00',
reverse=True,
limit=1):
for k, v in tr.get_range(
self._node_subspace.range(()).start,
self._node_subspace.pack((key,)) + b"\x00",
reverse=True,
limit=1,
):
prev_prefix = self._node_subspace.unpack(k)[0]
if key.startswith(prev_prefix):
return self._node_with_prefix(prev_prefix)
@ -489,7 +557,7 @@ class DirectoryLayer(Directory):
def _contents_of_node(self, node, path, layer=None):
prefix = self._node_subspace.unpack(node.key())[0]
if layer == b'partition':
if layer == b"partition":
return DirectoryPartition(self._path + path, prefix, self)
else:
return DirectorySubspace(self._path + path, prefix, self, layer)
@ -497,8 +565,12 @@ class DirectoryLayer(Directory):
def _find(self, tr, path):
n = _Node(self._root_node, (), path)
for i, name in enumerate(path):
n = _Node(self._node_with_prefix(tr[n.subspace[self.SUBDIRS][name]]), path[:i + 1], path)
if not n.exists() or n.layer(tr) == b'partition':
n = _Node(
self._node_with_prefix(tr[n.subspace[self.SUBDIRS][name]]),
path[: i + 1],
path,
)
if not n.exists() or n.layer(tr) == b"partition":
return n
return n
@ -521,8 +593,19 @@ class DirectoryLayer(Directory):
# Returns true if the given prefix does not "intersect" any currently
# allocated prefix (including the root node). This means that it neither
# contains any other prefix nor is contained by any other prefix.
return prefix and not self._node_containing_key(tr, prefix) \
and not len(list(tr.get_range(self._node_subspace.pack((prefix,)), self._node_subspace.pack((_impl.strinc(prefix),)), limit=1)))
return (
prefix
and not self._node_containing_key(tr, prefix)
and not len(
list(
tr.get_range(
self._node_subspace.pack((prefix,)),
self._node_subspace.pack((_impl.strinc(prefix),)),
limit=1,
)
)
)
)
def _is_prefix_empty(self, tr, prefix):
return len(list(tr.get_range(prefix, _impl.strinc(prefix), limit=1))) == 0
@ -541,11 +624,15 @@ def _to_unicode_path(path):
if isinstance(name, bytes):
path[i] = six.text_type(path[i])
elif not isinstance(name, six.text_type):
raise ValueError('Invalid path: must be a unicode string or a tuple of unicode strings')
raise ValueError(
"Invalid path: must be a unicode string or a tuple of unicode strings"
)
return tuple(path)
raise ValueError('Invalid path: must be a unicode string or a tuple of unicode strings')
raise ValueError(
"Invalid path: must be a unicode string or a tuple of unicode strings"
)
directory = DirectoryLayer()
@ -561,43 +648,59 @@ class DirectorySubspace(Subspace, Directory):
Directory.__init__(self, directory_layer, path, layer)
def __repr__(self):
return 'DirectorySubspace(path=' + repr(self._path) + ', prefix=' + repr(self.rawPrefix) + ')'
return (
"DirectorySubspace(path="
+ repr(self._path)
+ ", prefix="
+ repr(self.rawPrefix)
+ ")"
)
class DirectoryPartition(DirectorySubspace):
def __init__(self, path, prefix, parent_directory_layer):
directory_layer = DirectoryLayer(Subspace(rawPrefix=prefix + b'\xfe'), Subspace(rawPrefix=prefix))
directory_layer = DirectoryLayer(
Subspace(rawPrefix=prefix + b"\xfe"), Subspace(rawPrefix=prefix)
)
directory_layer._path = path
DirectorySubspace.__init__(self, path, prefix, directory_layer, b'partition')
DirectorySubspace.__init__(self, path, prefix, directory_layer, b"partition")
self._parent_directory_layer = parent_directory_layer
def __repr__(self):
return 'DirectoryPartition(path=' + repr(self._path) + ', prefix=' + repr(self.rawPrefix) + ')'
return (
"DirectoryPartition(path="
+ repr(self._path)
+ ", prefix="
+ repr(self.rawPrefix)
+ ")"
)
def __getitem__(self, name):
raise Exception('Cannot open subspace in the root of a directory partition.')
raise Exception("Cannot open subspace in the root of a directory partition.")
def key(self):
raise Exception('Cannot get key for the root of a directory partition.')
raise Exception("Cannot get key for the root of a directory partition.")
def pack(self, t=tuple()):
raise Exception('Cannot pack keys using the root of a directory partition.')
raise Exception("Cannot pack keys using the root of a directory partition.")
def unpack(self, key):
raise Exception('Cannot unpack keys using the root of a directory partition.')
raise Exception("Cannot unpack keys using the root of a directory partition.")
def range(self, t=tuple()):
raise Exception('Cannot get range for the root of a directory partition.')
raise Exception("Cannot get range for the root of a directory partition.")
def contains(self, key):
raise Exception('Cannot check whether a key belongs to the root of a directory partition.')
raise Exception(
"Cannot check whether a key belongs to the root of a directory partition."
)
def as_foundationdb_key(self):
raise Exception('Cannot use the root of a directory partition as a key.')
raise Exception("Cannot use the root of a directory partition as a key.")
def subspace(self, tuple):
raise Exception('Cannot open subspace in the root of a directory partition.')
raise Exception("Cannot open subspace in the root of a directory partition.")
def _get_layer_for_path(self, path):
if path == ():
@ -606,8 +709,7 @@ class DirectoryPartition(DirectorySubspace):
return self._directory_layer
class _Node (object):
class _Node(object):
def __init__(self, subspace, path, target_path):
self.subspace = subspace
self.path = path
@ -625,17 +727,23 @@ class _Node (object):
def layer(self, tr=None):
if tr:
self._layer = tr[self.subspace[b'layer']]
self._layer = tr[self.subspace[b"layer"]]
elif self._layer is None:
raise Exception('Layer has not been read')
raise Exception("Layer has not been read")
return self._layer
def is_in_partition(self, tr=None, include_empty_subpath=False):
return self.exists() and self.layer(tr) == b'partition' and (include_empty_subpath or len(self.target_path) > len(self.path))
return (
self.exists()
and self.layer(tr) == b"partition"
and (include_empty_subpath or len(self.target_path) > len(self.path))
)
def get_partition_subpath(self):
return self.target_path[len(self.path):]
return self.target_path[len(self.path) :]
def get_contents(self, directory_layer, tr=None):
return directory_layer._contents_of_node(self.subspace, self.path, self.layer(tr))
return directory_layer._contents_of_node(
self.subspace, self.path, self.layer(tr)
)
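For orientation (not part of the diff), a hedged sketch of typical directory-layer usage; the directory path and keys are placeholders. `create_or_open` allocates a short prefix through the HighContentionAllocator above and returns a DirectorySubspace rooted at that prefix.

```python
import fdb
import fdb.tuple

fdb.api_version(720)
db = fdb.open()

# First creation allocates a prefix via HighContentionAllocator.allocate;
# subsequent calls open the existing directory.
users_dir = fdb.directory.create_or_open(db, ("application", "users"))

@fdb.transactional
def add_user(tr, user_id, name):
    # Keys are tuple-encoded under the directory's allocated prefix.
    tr[users_dir.pack((user_id,))] = fdb.tuple.pack((name,))

add_user(db, 42, "alice")
```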

File diff suppressed because it is too large

View File

@ -40,13 +40,15 @@ def _get_boundary_keys(db_or_tr, begin, end):
lastbegin = begin
tr.options.set_read_system_keys()
tr.options.set_lock_aware()
kvs = tr.snapshot.get_range(b'\xff' + b'/keyServers/' + begin, b'\xff' + b'/keyServers/' + end)
kvs = tr.snapshot.get_range(
b"\xff" + b"/keyServers/" + begin, b"\xff" + b"/keyServers/" + end
)
if first_time:
first_time = False
yield None # trick to get the above get_range to be asynchronously dispatched before get_boundary_keys() returns.
for kv in kvs:
yield kv.key[13:]
begin = kv.key[13:] + b'\x00'
begin = kv.key[13:] + b"\x00"
begin = end
except _impl.FDBError as e:
# if we get a transaction_too_old and *something* has happened, then we are no longer transactional
@ -71,4 +73,8 @@ def get_boundary_keys(db_or_tr, begin, end):
@_impl.transactional
def get_addresses_for_key(tr, key):
keyBytes = _impl.keyToBytes(key)
return _impl.FutureStringArray(tr.capi.fdb_transaction_get_addresses_for_key(tr.tpointer, keyBytes, len(keyBytes)))
return _impl.FutureStringArray(
tr.capi.fdb_transaction_get_addresses_for_key(
tr.tpointer, keyBytes, len(keyBytes)
)
)
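A short usage sketch (not part of the diff; assumes an already opened database): `get_boundary_keys` returns a lazily consumed generator, which is why the code above pre-dispatches the first `get_range` before yielding.

```python
import fdb

fdb.api_version(720)
db = fdb.open()

# Shard boundary keys for the whole normal keyspace; the range reads are
# issued as the generator is consumed.
for boundary in fdb.locality.get_boundary_keys(db, b"", b"\xff"):
    print(boundary)
```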

View File

@ -23,13 +23,12 @@
import fdb.tuple
class Subspace (object):
def __init__(self, prefixTuple=tuple(), rawPrefix=b''):
class Subspace(object):
def __init__(self, prefixTuple=tuple(), rawPrefix=b""):
self.rawPrefix = fdb.tuple.pack(prefixTuple, prefix=rawPrefix)
def __repr__(self):
return 'Subspace(rawPrefix=' + repr(self.rawPrefix) + ')'
return "Subspace(rawPrefix=" + repr(self.rawPrefix) + ")"
def __getitem__(self, name):
return Subspace((name,), self.rawPrefix)
@ -45,7 +44,7 @@ class Subspace (object):
def unpack(self, key):
if not self.contains(key):
raise ValueError('Cannot unpack key that is not in subspace.')
raise ValueError("Cannot unpack key that is not in subspace.")
return fdb.tuple.unpack(key, prefix_len=len(self.rawPrefix))
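A quick illustrative round trip (not part of the diff; the prefix tuple is arbitrary) showing the pack/contains/unpack contract above:

```python
import fdb

fdb.api_version(720)

temps = fdb.Subspace(("temperature_readings",))
key = temps.pack((2022, 11, 29))  # prefix + tuple-encoded remainder

assert temps.contains(key)
assert temps.unpack(key) == (2022, 11, 29)

# Keys outside the subspace raise ValueError, as implemented above.
try:
    temps.unpack(b"unrelated_key")
except ValueError:
    pass
```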

View File

@ -25,9 +25,10 @@ https://apple.github.io/foundationdb/api-python.html"""
from fdb import impl as _impl
_tenant_map_prefix = b'\xff\xff/management/tenant/map/'
_tenant_map_prefix = b"\xff\xff/management/tenant/map/"
# If the existence_check_marker is an empty list, then check whether the tenant exists.
# After the check, append an item to the existence_check_marker list so that subsequent
# calls to this function will not perform the existence check.
#
@ -37,11 +38,12 @@ def _check_tenant_existence(tr, key, existence_check_marker, force_maybe_commite
existing_tenant = tr[key].wait()
existence_check_marker.append(None)
if force_maybe_commited:
raise _impl.FDBError(1021) # maybe_committed
raise _impl.FDBError(1021) # maybe_committed
return existing_tenant != None
return None
# Attempt to create a tenant in the cluster. If existence_check_marker is an empty
# list, then this function will check if the tenant already exists and fail if it does.
# Once the existence check is completed, it will not be done again if this function
@ -51,15 +53,23 @@ def _check_tenant_existence(tr, key, existence_check_marker, force_maybe_commite
#
# If the existence_check_marker is a non-empty list, then the existence check is skipped.
@_impl.transactional
def _create_tenant_impl(tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False):
def _create_tenant_impl(
tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False
):
tr.options.set_special_key_space_enable_writes()
key = b'%s%s' % (_tenant_map_prefix, tenant_name)
key = b"%s%s" % (_tenant_map_prefix, tenant_name)
if (
_check_tenant_existence(
tr, key, existence_check_marker, force_existence_check_maybe_committed
)
is True
):
raise _impl.FDBError(2132) # tenant_already_exists
tr[key] = b""
if _check_tenant_existence(tr, key, existence_check_marker, force_existence_check_maybe_committed) is True:
raise _impl.FDBError(2132) # tenant_already_exists
tr[key] = b''
# Attempt to delete a tenant from the cluster. If existence_check_marker is an empty
# list, then this function will check if the tenant already exists and fail if it does
# not. Once the existence check is completed, it will not be done again if this function
@ -69,15 +79,23 @@ def _create_tenant_impl(tr, tenant_name, existence_check_marker, force_existence
#
# If the existence_check_marker is a non-empty list, then the existence check is skipped.
@_impl.transactional
def _delete_tenant_impl(tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False):
def _delete_tenant_impl(
tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False
):
tr.options.set_special_key_space_enable_writes()
key = b'%s%s' % (_tenant_map_prefix, tenant_name)
key = b"%s%s" % (_tenant_map_prefix, tenant_name)
if _check_tenant_existence(tr, key, existence_check_marker, force_existence_check_maybe_committed) is False:
raise _impl.FDBError(2131) # tenant_not_found
if (
_check_tenant_existence(
tr, key, existence_check_marker, force_existence_check_maybe_committed
)
is False
):
raise _impl.FDBError(2131) # tenant_not_found
del tr[key]
class FDBTenantList(object):
"""Iterates over the results of list_tenants query. Returns
KeyValue objects.
@ -96,6 +114,7 @@ class FDBTenantList(object):
tenant_name = _impl.remove_prefix(next_item.key, _tenant_map_prefix)
yield _impl.KeyValue(tenant_name, next_item.value)
# Lists the tenants created in the cluster, specified by the begin and end range.
# Also limited in number of results by the limit parameter.
# Returns an iterable object that yields KeyValue objects
@ -104,29 +123,36 @@ class FDBTenantList(object):
@_impl.transactional
def _list_tenants_impl(tr, begin, end, limit):
tr.options.set_raw_access()
begin_key = b'%s%s' % (_tenant_map_prefix, begin)
end_key = b'%s%s' % (_tenant_map_prefix, end)
begin_key = b"%s%s" % (_tenant_map_prefix, begin)
end_key = b"%s%s" % (_tenant_map_prefix, end)
rangeresult = tr.get_range(begin_key, end_key, limit)
return FDBTenantList(rangeresult)
def create_tenant(db_or_tr, tenant_name):
tenant_name = _impl.process_tenant_name(tenant_name)
# Only perform the existence check when run using a database
# Callers using a transaction are expected to check existence themselves if required
existence_check_marker = [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None]
existence_check_marker = (
[] if not isinstance(db_or_tr, _impl.TransactionRead) else [None]
)
_create_tenant_impl(db_or_tr, tenant_name, existence_check_marker)
def delete_tenant(db_or_tr, tenant_name):
tenant_name = _impl.process_tenant_name(tenant_name)
# Only perform the existence check when run using a database
# Callers using a transaction are expected to check existence themselves if required
existence_check_marker = [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None]
existence_check_marker = (
[] if not isinstance(db_or_tr, _impl.TransactionRead) else [None]
)
_delete_tenant_impl(db_or_tr, tenant_name, existence_check_marker)
def list_tenants(db_or_tr, begin, end, limit):
begin = _impl.process_tenant_name(begin)
end = _impl.process_tenant_name(end)
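To illustrate the marker-based existence check described in the comments above, a hedged sketch (tenant names are placeholders): database-level calls run the existence check only on the first attempt, so retries after a maybe_committed error stay idempotent, while transaction-level calls skip the check.

```python
import fdb
import fdb.tenant_management

fdb.api_version(720)
db = fdb.open()

# Database-level call: an empty existence_check_marker is created internally,
# so the first attempt checks for the tenant and any maybe_committed retry
# skips the check instead of raising tenant_already_exists.
fdb.tenant_management.create_tenant(db, b"sample_tenant")

# List tenants in a range; yields KeyValue objects as described above.
for t in fdb.tenant_management.list_tenants(db, b"", b"\xff", 10):
    print(t.key)

fdb.tenant_management.delete_tenant(db, b"sample_tenant")
```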

View File

@ -39,8 +39,8 @@ BYTES_CODE = 0x01
STRING_CODE = 0x02
NESTED_CODE = 0x05
INT_ZERO_CODE = 0x14
POS_INT_END = 0x1d
NEG_INT_START = 0x0b
POS_INT_END = 0x1D
NEG_INT_START = 0x0B
FLOAT_CODE = 0x20
DOUBLE_CODE = 0x21
FALSE_CODE = 0x26
@ -54,10 +54,10 @@ VERSIONSTAMP_CODE = 0x33
def _find_terminator(v, pos):
# Finds the start of the next terminator [\x00]![\xff] or the end of v
while True:
pos = v.find(b'\x00', pos)
pos = v.find(b"\x00", pos)
if pos < 0:
return len(v)
if pos + 1 == len(v) or v[pos + 1:pos + 2] != b'\xff':
if pos + 1 == len(v) or v[pos + 1 : pos + 2] != b"\xff":
return pos
pos += 2
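# A worked example of the terminator search above (illustration only; assumes the module
# is importable as fdb.tuple): a literal NUL inside a bytes element is escaped as
# b"\x00\xff", so the scan skips escaped NULs and stops at the first unescaped one.
import fdb
fdb.api_version(720)
from fdb.tuple import pack, _find_terminator
v = pack((b"a\x00b", 0))            # encodes to b"\x01a\x00\xffb\x00\x14"
assert _find_terminator(v, 1) == 5  # the escaped NUL at index 2 is skipped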
@ -66,9 +66,9 @@ def _find_terminator(v, pos):
# If decoding and sign bit is 0 (negative), flip all of the bits. Otherwise, just flip sign.
def _float_adjust(v, encode):
if encode and six.indexbytes(v, 0) & 0x80 != 0x00:
return b''.join(map(lambda x: six.int2byte(x ^ 0xff), six.iterbytes(v)))
return b"".join(map(lambda x: six.int2byte(x ^ 0xFF), six.iterbytes(v)))
elif not encode and six.indexbytes(v, 0) & 0x80 != 0x80:
return b''.join(map(lambda x: six.int2byte(x ^ 0xff), six.iterbytes(v)))
return b"".join(map(lambda x: six.int2byte(x ^ 0xFF), six.iterbytes(v)))
else:
return six.int2byte(six.indexbytes(v, 0) ^ 0x80) + v[1:]
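# A worked example of the adjustment above (illustration only): flipping the sign bit of
# non-negative floats and all bits of negative ones makes the lexicographic order of the
# encoded bytes match the numeric order of the values.
import fdb
fdb.api_version(720)
from fdb.tuple import pack
assert pack((-2.0,)) < pack((-1.0,)) < pack((0.0,)) < pack((1.5,))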
@ -84,7 +84,9 @@ class SingleFloat(object):
elif isinstance(value, six.integer_types):
self.value = ctypes.c_float(value).value
else:
raise ValueError("Incompatible type for single-precision float: " + repr(value))
raise ValueError(
"Incompatible type for single-precision float: " + repr(value)
)
# Comparisons
def __eq__(self, other):
@ -119,24 +121,42 @@ class Versionstamp(object):
LENGTH = 12
_TR_VERSION_LEN = 10
_MAX_USER_VERSION = (1 << 16) - 1
_UNSET_TR_VERSION = 10 * six.int2byte(0xff)
_STRUCT_FORMAT_STRING = '>' + str(_TR_VERSION_LEN) + 'sH'
_UNSET_TR_VERSION = 10 * six.int2byte(0xFF)
_STRUCT_FORMAT_STRING = ">" + str(_TR_VERSION_LEN) + "sH"
@classmethod
def validate_tr_version(cls, tr_version):
if tr_version is None:
return
if not isinstance(tr_version, bytes):
raise TypeError("Global version has illegal type " + str(type(tr_version)) + " (requires bytes)")
raise TypeError(
"Global version has illegal type "
+ str(type(tr_version))
+ " (requires bytes)"
)
elif len(tr_version) != cls._TR_VERSION_LEN:
raise ValueError("Global version has incorrect length " + str(len(tr_version)) + " (requires " + str(cls._TR_VERSION_LEN) + ")")
raise ValueError(
"Global version has incorrect length "
+ str(len(tr_version))
+ " (requires "
+ str(cls._TR_VERSION_LEN)
+ ")"
)
@classmethod
def validate_user_version(cls, user_version):
if not isinstance(user_version, six.integer_types):
raise TypeError("Local version has illegal type " + str(type(user_version)) + " (requires integer type)")
raise TypeError(
"Local version has illegal type "
+ str(type(user_version))
+ " (requires integer type)"
)
elif user_version < 0 or user_version > cls._MAX_USER_VERSION:
raise ValueError("Local version has value " + str(user_version) + " which is out of range")
raise ValueError(
"Local version has value "
+ str(user_version)
+ " which is out of range"
)
def __init__(self, tr_version=None, user_version=0):
Versionstamp.validate_tr_version(tr_version)
@ -153,30 +173,50 @@ class Versionstamp(object):
if not isinstance(v, bytes):
raise TypeError("Cannot parse versionstamp from non-byte string")
elif len(v) - start < cls.LENGTH:
raise ValueError("Versionstamp byte string is too short (only " + str(len(v) - start) + " bytes to read from")
raise ValueError(
"Versionstamp byte string is too short (only "
+ str(len(v) - start)
+ " bytes to read from"
)
else:
tr_version = v[start:start + cls._TR_VERSION_LEN]
tr_version = v[start : start + cls._TR_VERSION_LEN]
if tr_version == cls._UNSET_TR_VERSION:
tr_version = None
user_version = six.indexbytes(v, start + cls._TR_VERSION_LEN) * (1 << 8) + six.indexbytes(v, start + cls._TR_VERSION_LEN + 1)
user_version = six.indexbytes(v, start + cls._TR_VERSION_LEN) * (
1 << 8
) + six.indexbytes(v, start + cls._TR_VERSION_LEN + 1)
return Versionstamp(tr_version, user_version)
def is_complete(self):
return self.tr_version is not None
def __repr__(self):
return "fdb.tuple.Versionstamp(" + repr(self.tr_version) + ", " + repr(self.user_version) + ")"
return (
"fdb.tuple.Versionstamp("
+ repr(self.tr_version)
+ ", "
+ repr(self.user_version)
+ ")"
)
def __str__(self):
return "Versionstamp(" + repr(self.tr_version) + ", " + str(self.user_version) + ")"
return (
"Versionstamp("
+ repr(self.tr_version)
+ ", "
+ str(self.user_version)
+ ")"
)
def to_bytes(self):
tr_version = self.tr_version
if isinstance(tr_version, fdb.impl.Value):
tr_version = tr_version.value
return struct.pack(self._STRUCT_FORMAT_STRING,
tr_version if self.is_complete() else self._UNSET_TR_VERSION,
self.user_version)
return struct.pack(
self._STRUCT_FORMAT_STRING,
tr_version if self.is_complete() else self._UNSET_TR_VERSION,
self.user_version,
)
def completed(self, new_tr_version):
if self.is_complete():
@ -187,7 +227,10 @@ class Versionstamp(object):
# Comparisons
def __eq__(self, other):
if isinstance(other, Versionstamp):
return self.tr_version == other.tr_version and self.user_version == other.user_version
return (
self.tr_version == other.tr_version
and self.user_version == other.user_version
)
else:
return False
@ -224,18 +267,22 @@ def _decode(v, pos):
return None, pos + 1
elif code == BYTES_CODE:
end = _find_terminator(v, pos + 1)
return v[pos + 1:end].replace(b"\x00\xFF", b"\x00"), end + 1
return v[pos + 1 : end].replace(b"\x00\xFF", b"\x00"), end + 1
elif code == STRING_CODE:
end = _find_terminator(v, pos + 1)
return v[pos + 1:end].replace(b"\x00\xFF", b"\x00").decode("utf-8"), end + 1
return v[pos + 1 : end].replace(b"\x00\xFF", b"\x00").decode("utf-8"), end + 1
elif code >= INT_ZERO_CODE and code < POS_INT_END:
n = code - 20
end = pos + 1 + n
return struct.unpack(">Q", b'\x00' * (8 - n) + v[pos + 1:end])[0], end
return struct.unpack(">Q", b"\x00" * (8 - n) + v[pos + 1 : end])[0], end
elif code > NEG_INT_START and code < INT_ZERO_CODE:
n = 20 - code
end = pos + 1 + n
return struct.unpack(">Q", b'\x00' * (8 - n) + v[pos + 1:end])[0] - _size_limits[n], end
return (
struct.unpack(">Q", b"\x00" * (8 - n) + v[pos + 1 : end])[0]
- _size_limits[n],
end,
)
elif code == POS_INT_END: # 0x1d; Positive 9-255 byte integer
length = six.indexbytes(v, pos + 1)
val = 0
@ -244,25 +291,37 @@ def _decode(v, pos):
val += six.indexbytes(v, pos + 2 + i)
return val, pos + 2 + length
elif code == NEG_INT_START: # 0x0b; Negative 9-255 byte integer
length = six.indexbytes(v, pos + 1) ^ 0xff
length = six.indexbytes(v, pos + 1) ^ 0xFF
val = 0
for i in _range(length):
val = val << 8
val += six.indexbytes(v, pos + 2 + i)
return val - (1 << (length * 8)) + 1, pos + 2 + length
elif code == FLOAT_CODE:
return SingleFloat(struct.unpack(">f", _float_adjust(v[pos + 1:pos + 5], False))[0]), pos + 5
return (
SingleFloat(
struct.unpack(">f", _float_adjust(v[pos + 1 : pos + 5], False))[0]
),
pos + 5,
)
elif code == DOUBLE_CODE:
return struct.unpack(">d", _float_adjust(v[pos + 1:pos + 9], False))[0], pos + 9
return (
struct.unpack(">d", _float_adjust(v[pos + 1 : pos + 9], False))[0],
pos + 9,
)
elif code == UUID_CODE:
return uuid.UUID(bytes=v[pos + 1:pos + 17]), pos + 17
return uuid.UUID(bytes=v[pos + 1 : pos + 17]), pos + 17
elif code == FALSE_CODE:
if fdb.is_api_version_selected() and fdb.get_api_version() < 500:
raise ValueError("Invalid API version " + str(fdb._version) + " for boolean types")
raise ValueError(
"Invalid API version " + str(fdb._version) + " for boolean types"
)
return False, pos + 1
elif code == TRUE_CODE:
if fdb.is_api_version_selected() and fdb.get_api_version() < 500:
raise ValueError("Invalid API version " + str(fdb._version) + " for boolean types")
raise ValueError(
"Invalid API version " + str(fdb._version) + " for boolean types"
)
return True, pos + 1
elif code == VERSIONSTAMP_CODE:
return Versionstamp.from_bytes(v, pos + 1), pos + 1 + Versionstamp.LENGTH
@ -271,7 +330,7 @@ def _decode(v, pos):
end_pos = pos + 1
while end_pos < len(v):
if six.indexbytes(v, end_pos) == 0x00:
if end_pos + 1 < len(v) and six.indexbytes(v, end_pos + 1) == 0xff:
if end_pos + 1 < len(v) and six.indexbytes(v, end_pos + 1) == 0xFF:
ret.append(None)
end_pos += 2
else:
@ -299,11 +358,15 @@ def _reduce_children(child_values):
if sys.version_info < (2, 7):
def _bit_length(x):
s = bin(x) # binary representation: bin(-37) --> '-0b100101'
s = s.lstrip('-0b') # remove leading zeros and minus sign
s = bin(x) # binary representation: bin(-37) --> '-0b100101'
s = s.lstrip("-0b") # remove leading zeros and minus sign
return len(s)
else:
def _bit_length(x):
return x.bit_length()
@ -314,23 +377,33 @@ def _encode(value, nested=False):
# sorting need to work too!
if value == None: # ==, not is, because some fdb.impl.Value are equal to None
if nested:
return b''.join([six.int2byte(NULL_CODE), six.int2byte(0xff)]), -1
return b"".join([six.int2byte(NULL_CODE), six.int2byte(0xFF)]), -1
else:
return b''.join([six.int2byte(NULL_CODE)]), -1
return b"".join([six.int2byte(NULL_CODE)]), -1
elif isinstance(value, bytes): # also gets non-None fdb.impl.Value
return six.int2byte(BYTES_CODE) + value.replace(b'\x00', b'\x00\xFF') + b'\x00', -1
return (
six.int2byte(BYTES_CODE) + value.replace(b"\x00", b"\x00\xFF") + b"\x00",
-1,
)
elif isinstance(value, six.text_type):
return six.int2byte(STRING_CODE) + value.encode('utf-8').replace(b'\x00', b'\x00\xFF') + b'\x00', -1
elif isinstance(value, six.integer_types) and (not isinstance(value, bool) or (hasattr(fdb, '_version') and fdb._version < 500)):
return (
six.int2byte(STRING_CODE)
+ value.encode("utf-8").replace(b"\x00", b"\x00\xFF")
+ b"\x00",
-1,
)
elif isinstance(value, six.integer_types) and (
not isinstance(value, bool) or (hasattr(fdb, "_version") and fdb._version < 500)
):
if value == 0:
return b''.join([six.int2byte(INT_ZERO_CODE)]), -1
return b"".join([six.int2byte(INT_ZERO_CODE)]), -1
elif value > 0:
if value >= _size_limits[-1]:
length = (_bit_length(value) + 7) // 8
data = [six.int2byte(POS_INT_END), six.int2byte(length)]
for i in _range(length - 1, -1, -1):
data.append(six.int2byte((value >> (8 * i)) & 0xff))
return b''.join(data), -1
data.append(six.int2byte((value >> (8 * i)) & 0xFF))
return b"".join(data), -1
n = bisect_left(_size_limits, value)
return six.int2byte(INT_ZERO_CODE + n) + struct.pack(">Q", value)[-n:], -1
@ -338,34 +411,53 @@ def _encode(value, nested=False):
if -value >= _size_limits[-1]:
length = (_bit_length(value) + 7) // 8
value += (1 << (length * 8)) - 1
data = [six.int2byte(NEG_INT_START), six.int2byte(length ^ 0xff)]
data = [six.int2byte(NEG_INT_START), six.int2byte(length ^ 0xFF)]
for i in _range(length - 1, -1, -1):
data.append(six.int2byte((value >> (8 * i)) & 0xff))
return b''.join(data), -1
data.append(six.int2byte((value >> (8 * i)) & 0xFF))
return b"".join(data), -1
n = bisect_left(_size_limits, -value)
maxv = _size_limits[n]
return six.int2byte(INT_ZERO_CODE - n) + struct.pack(">Q", maxv + value)[-n:], -1
return (
six.int2byte(INT_ZERO_CODE - n) + struct.pack(">Q", maxv + value)[-n:],
-1,
)
elif isinstance(value, ctypes.c_float) or isinstance(value, SingleFloat):
return six.int2byte(FLOAT_CODE) + _float_adjust(struct.pack(">f", value.value), True), -1
return (
six.int2byte(FLOAT_CODE)
+ _float_adjust(struct.pack(">f", value.value), True),
-1,
)
elif isinstance(value, ctypes.c_double):
return six.int2byte(DOUBLE_CODE) + _float_adjust(struct.pack(">d", value.value), True), -1
return (
six.int2byte(DOUBLE_CODE)
+ _float_adjust(struct.pack(">d", value.value), True),
-1,
)
elif isinstance(value, float):
return six.int2byte(DOUBLE_CODE) + _float_adjust(struct.pack(">d", value), True), -1
return (
six.int2byte(DOUBLE_CODE) + _float_adjust(struct.pack(">d", value), True),
-1,
)
elif isinstance(value, uuid.UUID):
return six.int2byte(UUID_CODE) + value.bytes, -1
elif isinstance(value, bool):
if value:
return b''.join([six.int2byte(TRUE_CODE)]), -1
return b"".join([six.int2byte(TRUE_CODE)]), -1
else:
return b''.join([six.int2byte(FALSE_CODE)]), -1
return b"".join([six.int2byte(FALSE_CODE)]), -1
elif isinstance(value, Versionstamp):
version_pos = -1 if value.is_complete() else 1
return six.int2byte(VERSIONSTAMP_CODE) + value.to_bytes(), version_pos
elif isinstance(value, tuple) or isinstance(value, list):
child_bytes, version_pos = _reduce_children(map(lambda x: _encode(x, True), value))
child_bytes, version_pos = _reduce_children(
map(lambda x: _encode(x, True), value)
)
new_version_pos = -1 if version_pos < 0 else version_pos + 1
return b''.join([six.int2byte(NESTED_CODE)] + child_bytes + [six.int2byte(0x00)]), new_version_pos
return (
b"".join([six.int2byte(NESTED_CODE)] + child_bytes + [six.int2byte(0x00)]),
new_version_pos,
)
else:
raise ValueError("Unsupported data type: " + str(type(value)))
@ -387,13 +479,13 @@ def _pack_maybe_with_versionstamp(t, prefix=None):
version_pos += len(prefix) if prefix is not None else 0
bytes_list.extend(child_bytes)
if fdb.is_api_version_selected() and fdb.get_api_version() < 520:
bytes_list.append(struct.pack('<H', version_pos))
bytes_list.append(struct.pack("<H", version_pos))
else:
bytes_list.append(struct.pack('<L', version_pos))
bytes_list.append(struct.pack("<L", version_pos))
else:
bytes_list.extend(child_bytes)
return b''.join(bytes_list), version_pos
return b"".join(bytes_list), version_pos
# packs the specified tuple into a key
@ -408,7 +500,9 @@ def pack(t, prefix=None):
def pack_with_versionstamp(t, prefix=None):
res, version_pos = _pack_maybe_with_versionstamp(t, prefix)
if version_pos < 0:
raise ValueError("No incomplete versionstamp included in tuple pack with versionstamp")
raise ValueError(
"No incomplete versionstamp included in tuple pack with versionstamp"
)
return res
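# A minimal sketch of the versionstamp packing above (illustration only): the tuple must
# contain an incomplete Versionstamp; at API 520 and later the 4-byte offset of that
# versionstamp is appended, and a tuple without one raises the ValueError shown above.
import fdb
fdb.api_version(720)
from fdb.tuple import pack_with_versionstamp, Versionstamp
key = pack_with_versionstamp((b"log", Versionstamp()))  # Versionstamp() with no args is incomplete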
@ -433,6 +527,7 @@ def has_incomplete_versionstamp(t):
return has_incomplete_versionstamp(item)
else:
return False
return any(map(_elem_has_incomplete, t))
@ -450,9 +545,7 @@ def range(t):
raise Exception("fdbtuple range() expects a tuple, got a " + str(type(t)))
p = pack(t)
return slice(
p + b'\x00',
p + b'\xff')
return slice(p + b"\x00", p + b"\xff")
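# A minimal sketch of the range helper above (illustration only): the returned slice
# covers every key that extends the packed tuple and can be fed directly to a range read.
import fdb
fdb.api_version(720)
from fdb.tuple import pack, range as tuple_range
r = tuple_range((b"users",))
assert (r.start, r.stop) == (pack((b"users",)) + b"\x00", pack((b"users",)) + b"\xff")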
def _code_for(value):
@ -462,7 +555,9 @@ def _code_for(value):
return BYTES_CODE
elif isinstance(value, six.text_type):
return STRING_CODE
elif (not hasattr(fdb, '_version') or fdb._version >= 500) and isinstance(value, bool):
elif (not hasattr(fdb, "_version") or fdb._version >= 500) and isinstance(
value, bool
):
return FALSE_CODE
elif isinstance(value, six.integer_types):
return INT_ZERO_CODE
@ -514,8 +609,8 @@ def _compare_values(value1, value2):
if code1 == NULL_CODE:
return 0
elif code1 == STRING_CODE:
encoded1 = value1.encode('utf-8')
encoded2 = value2.encode('utf-8')
encoded1 = value1.encode("utf-8")
encoded2 = value2.encode("utf-8")
return -1 if encoded1 < encoded2 else 0 if encoded1 == encoded2 else 1
elif code1 == FLOAT_CODE:
f1 = value1 if isinstance(value1, SingleFloat) else SingleFloat(value1.value)

View File

@ -518,7 +518,7 @@ def test_timeouts(db):
for i in range(2):
tr.options.set_timeout(1500)
tr.set_read_version(0x7ffffffffffffff0)
x = tr[b'foo']
_ = tr[b'foo']
try:
tr.commit().wait()
tr.reset()
@ -557,7 +557,7 @@ def test_db_timeouts(db):
tr[b'foo'] = b'bar'
tr.on_error(err).wait() # should not throw
time.sleep(1)
tr[b'foo']
_ = tr[b'foo']
try:
tr.commit().wait() # should throw
raise TestError("(2) Timeout didn't fire.")
@ -574,7 +574,7 @@ def test_db_timeouts(db):
time.sleep(0.75)
tr[b'foo'] = b'bar'
tr.on_error(err).wait() # should not throw
tr[b'foo']
_ = tr[b'foo']
time.sleep(0.75)
try:
tr.commit().wait() # should throw
@ -615,7 +615,7 @@ def test_db_timeouts(db):
tr.reset()
tr[b'foo'] = b'bar'
time.sleep(0.2)
tr.on_error(err).wait() #should not throw
tr.on_error(err).wait() # should not throw
tr[b'foo'] = b'bar'
time.sleep(0.8)
try:

View File

@ -24,15 +24,18 @@ import sys
if __name__ == '__main__':
fdb.api_version(720)
@fdb.transactional
def setValue(tr, key, value):
tr[key] = value
@fdb.transactional
def setValueWithLimit(tr, key, value, limit):
tr.options.set_size_limit(limit)
tr[key] = value
def test_size_limit_option(db):
value = b'a' * 1024
@ -69,6 +72,7 @@ def test_size_limit_option(db):
# Reset the size limit for future tests
db.options.set_transaction_size_limit(10000000)
@fdb.transactional
def test_get_approximate_size(tr):
tr[b'key1'] = b'value1'
@ -90,6 +94,7 @@ def test_get_approximate_size(tr):
s5 = tr.get_approximate_size().wait()
assert(s4 < s5)
# Expect a cluster file as input. This test will write to the FDB cluster, so
# be aware of potential side effects.
if __name__ == '__main__':

View File

@ -27,24 +27,26 @@ from fdb.tuple import pack
if __name__ == '__main__':
fdb.api_version(720)
def cleanup_tenant(db, tenant_name):
try:
tenant = db.open_tenant(tenant_name)
del tenant[:]
fdb.tenant_management.delete_tenant(db, tenant_name)
except fdb.FDBError as e:
if e.code == 2131: # tenant not found
if e.code == 2131: # tenant not found
pass
else:
raise
def test_tenant_tuple_name(db):
tuplename=(b'test', b'level', b'hierarchy', 3, 1.24, 'str')
tuplename = (b'test', b'level', b'hierarchy', 3, 1.24, 'str')
cleanup_tenant(db, tuplename)
fdb.tenant_management.create_tenant(db, tuplename)
tenant=db.open_tenant(tuplename)
tenant = db.open_tenant(tuplename)
tenant[b'foo'] = b'bar'
assert tenant[b'foo'] == b'bar'
@ -100,7 +102,7 @@ def test_tenant_operations(db):
del tr1[:]
tr1.commit().wait()
except fdb.FDBError as e:
tr.on_error(e).wait()
tr1.on_error(e).wait()
assert tenant1[b'tenant_test_key'] == None
assert db[prefix1 + b'tenant_test_key'] == None
@ -113,7 +115,7 @@ def test_tenant_operations(db):
tenant1[b'tenant_test_key']
assert False
except fdb.FDBError as e:
assert e.code == 2131 # tenant not found
assert e.code == 2131 # tenant not found
del tenant2[:]
fdb.tenant_management.delete_tenant(db, b'tenant2')
@ -126,6 +128,7 @@ def test_tenant_operations(db):
assert db[b'tenant_test_key'] == None
def test_tenant_operation_retries(db):
cleanup_tenant(db, b'tenant1')
cleanup_tenant(db, b'tenant2')
@ -138,7 +141,7 @@ def test_tenant_operation_retries(db):
fdb.tenant_management.create_tenant(db, b'tenant1')
assert False
except fdb.FDBError as e:
assert e.code == 2132 # tenant already exists
assert e.code == 2132 # tenant already exists
# Using a transaction skips the existence check
tr = db.create_transaction()
@ -166,7 +169,7 @@ def test_tenant_operation_retries(db):
fdb.tenant_management.delete_tenant(db, b'tenant1')
assert False
except fdb.FDBError as e:
assert e.code == 2131 # tenant not found
assert e.code == 2131 # tenant not found
# Using a transaction skips the existence check
tr = db.create_transaction()
@ -186,11 +189,13 @@ def test_tenant_operation_retries(db):
except fdb.FDBError as e:
tr.on_error(e).wait()
def test_tenants(db):
test_tenant_tuple_name(db)
test_tenant_operations(db)
test_tenant_operation_retries(db)
# Expect a cluster file as input. This test will write to the FDB cluster, so
# be aware of potential side effects.
if __name__ == '__main__':

View File

@ -26,7 +26,6 @@ import sys
import os
import struct
import threading
import time
import random
import time
import traceback
@ -136,7 +135,7 @@ def test_fdb_transactional_generator(db):
def function_that_yields(tr):
yield 0
assert fdb.get_api_version() < 630, "Pre-6.3, a decorator may wrap a function that yields"
except ValueError as e:
except ValueError:
assert fdb.get_api_version() >= 630, "Post-6.3, a decorator should throw if wrapped function yields"
@ -144,12 +143,13 @@ def test_fdb_transactional_returns_generator(db):
try:
def function_that_yields(tr):
yield 0
@fdb.transactional
def function_that_returns(tr):
return function_that_yields(tr)
function_that_returns()
assert fdb.get_api_version() < 630, "Pre-6.3, returning a generator is allowed"
except ValueError as e:
except ValueError:
assert fdb.get_api_version() >= 630, "Post-6.3, returning a generator should throw"
@ -400,11 +400,11 @@ class Tester:
inst.push(f)
elif inst.op == six.u("GET_ESTIMATED_RANGE_SIZE"):
begin, end = inst.pop(2)
estimatedSize = obj.get_estimated_range_size_bytes(begin, end).wait()
obj.get_estimated_range_size_bytes(begin, end).wait()
inst.push(b"GOT_ESTIMATED_RANGE_SIZE")
elif inst.op == six.u("GET_RANGE_SPLIT_POINTS"):
begin, end, chunkSize = inst.pop(3)
estimatedSize = obj.get_range_split_points(begin, end, chunkSize).wait()
obj.get_range_split_points(begin, end, chunkSize).wait()
inst.push(b"GOT_RANGE_SPLIT_POINTS")
elif inst.op == six.u("GET_KEY"):
key, or_equal, offset, prefix = inst.pop(4)
@ -522,7 +522,7 @@ class Tester:
self.last_version = inst.tr.get_committed_version()
inst.push(b"GOT_COMMITTED_VERSION")
elif inst.op == six.u("GET_APPROXIMATE_SIZE"):
approximate_size = inst.tr.get_approximate_size().wait()
inst.tr.get_approximate_size().wait()
inst.push(b"GOT_APPROXIMATE_SIZE")
elif inst.op == six.u("GET_VERSIONSTAMP"):
inst.push(inst.tr.get_versionstamp())
@ -613,9 +613,9 @@ class Tester:
result += [tenant.key]
try:
metadata = json.loads(tenant.value)
id = metadata["id"]
prefix = metadata["prefix"]
except (json.decoder.JSONDecodeError, KeyError) as e:
_ = metadata["id"]
_ = metadata["prefix"]
except (json.decoder.JSONDecodeError, KeyError):
assert False, "Invalid Tenant Metadata"
inst.push(fdb.tuple.pack(tuple(result)))
elif inst.op == six.u("UNIT_TESTS"):

View File

@ -173,7 +173,7 @@ def tupleTest(N=10000):
print("Prefix not before prefixed:\n Tuple: %s\n Bytes: %s\n Other: %s\n Bytes: %s" % (t, repr(pack(t)), t2, repr(pack(t2))))
return False
print ("Tuple check %d OK" % N)
print("Tuple check %d OK" % N)
return True
# test:

View File

@ -622,3 +622,39 @@ function(add_java_test)
-Djava.library.path=${CMAKE_BINARY_DIR}/lib
${T_CLASS} "@CLUSTER_FILE@")
endfunction()
# Adds an FDB test implemented by a script that does the full setup, such as creating a
# cluster and running client binaries as necessary
function(add_scripted_fdb_test)
set(options DISABLED ENABLED)
set(oneValueArgs NAME TEST_TIMEOUT)
set(multiValueArgs COMMAND)
cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
if(OPEN_FOR_IDE)
return()
endif()
if(NOT T_ENABLED AND T_DISABLED)
return()
endif()
if(NOT T_NAME)
message(FATAL_ERROR "NAME is a required argument for add_scripted_fdb_test")
endif()
if(NOT T_COMMAND)
message(FATAL_ERROR "COMMAND is a required argument for add_scripted_fdb_test")
endif()
message(STATUS "Adding Scripted FDB test ${T_NAME}")
add_test(NAME "${T_NAME}"
COMMAND ${T_COMMAND})
set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT
"${SANITIZER_OPTIONS};PYTHONPATH=${CMAKE_SOURCE_DIR}/tests/TestRunner:${CMAKE_BINARY_DIR}/tests/TestRunner")
if (T_TEST_TIMEOUT)
set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT})
else()
# default timeout
if(USE_SANITIZER)
set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 1200)
else()
set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300)
endif()
endif()
endfunction()

View File

@ -1,6 +1,6 @@
# FindRocksDB
find_package(RocksDB 6.27.3)
find_package(RocksDB 7.7.3)
include(ExternalProject)
@ -49,8 +49,8 @@ if(ROCKSDB_FOUND)
${BINARY_DIR}/librocksdb.a)
else()
ExternalProject_Add(rocksdb
URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz
URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58
URL https://github.com/facebook/rocksdb/archive/refs/tags/v7.7.3.tar.gz
URL_HASH SHA256=b8ac9784a342b2e314c821f6d701148912215666ac5e9bdbccd93cf3767cb611
CMAKE_ARGS ${RocksDB_CMAKE_ARGS}
BUILD_BYPRODUCTS <BINARY_DIR>/librocksdb.a
INSTALL_COMMAND ""

View File

@ -42,6 +42,7 @@ parser.add_argument('--no-graph', action='store_true', default=False, help='Disa
args = parser.parse_args()
def print_choices_list(context=None):
if context == 'workload' or context is None:
print('Workloads:')
@ -70,6 +71,7 @@ def print_choices_list(context=None):
name = name[0:-len('Limiter')]
print(' %s' % name)
if args.workload is None or args.ratekeeper is None:
print('ERROR: A workload (-w/--workload) and ratekeeper model (-r/--ratekeeper) must be specified.\n')
print_choices_list()
@ -79,16 +81,18 @@ if args.list:
print_choices_list()
sys.exit(0)
def validate_class_type(var, name, superclass):
cls = getattr(var, name, None)
return cls is not None and inspect.isclass(cls) and issubclass(cls, superclass)
if not args.ratekeeper in ratekeeper_model.predefined_ratekeeper:
if args.ratekeeper not in ratekeeper_model.predefined_ratekeeper:
print('Invalid ratekeeper model `%s\'' % args.ratekeeper)
print_choices_list('ratekeeper')
sys.exit(1)
if not args.workload in workload_model.predefined_workloads:
if args.workload not in workload_model.predefined_workloads:
print('Invalid workload model `%s\'' % args.workload)
print_choices_list('workload')
sys.exit(1)
@ -120,11 +124,11 @@ for priority in workload.priorities():
still_queued = sum([r.count for r in proxy.request_queue if r.priority == priority])
if len(latencies) > 0:
print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' % (priority, total_started, proxy.time, float(total_started)/proxy.time, still_queued))
print(' Median latency: %f' % latencies[len(latencies)//2])
print(' 90%% latency: %f' % latencies[int(0.9*len(latencies))])
print(' 99%% latency: %f' % latencies[int(0.99*len(latencies))])
print(' 99.9%% latency: %f' % latencies[int(0.999*len(latencies))])
print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' % (priority, total_started, proxy.time, float(total_started) / proxy.time, still_queued))
print(' Median latency: %f' % latencies[len(latencies) // 2])
print(' 90%% latency: %f' % latencies[int(0.9 * len(latencies))])
print(' 99%% latency: %f' % latencies[int(0.99 * len(latencies))])
print(' 99.9%% latency: %f' % latencies[int(0.999 * len(latencies))])
print(' Max latency: %f' % latencies[-1])
print('')

View File

@ -20,6 +20,7 @@
import matplotlib.pyplot as plt
class Plotter:
def __init__(self, results):
self.results = results
@ -28,13 +29,13 @@ class Plotter:
out_data = {}
counts = {}
for t in data.keys():
out_data.setdefault(t//time_resolution*time_resolution, 0)
counts.setdefault(t//time_resolution*time_resolution, 0)
out_data[t//time_resolution*time_resolution] += data[t]
counts[t//time_resolution*time_resolution] += 1
out_data.setdefault(t // time_resolution * time_resolution, 0)
counts.setdefault(t // time_resolution * time_resolution, 0)
out_data[t // time_resolution * time_resolution] += data[t]
counts[t // time_resolution * time_resolution] += 1
if use_avg:
out_data = { t: v/counts[t] for t,v in out_data.items() }
out_data = {t: v / counts[t] for t, v in out_data.items()}
plt.plot(list(out_data.keys()), list(out_data.values()), label=label)
@ -42,7 +43,7 @@ class Plotter:
plt.plot(list(data.keys()), list(data.values()), label=label)
def display(self, time_resolution=0.1):
plt.figure(figsize=(40,9))
plt.figure(figsize=(40, 9))
plt.subplot(3, 3, 1)
for priority in self.results.started.keys():
Plotter.add_plot(self.results.started[priority], time_resolution, priority)
@ -61,7 +62,7 @@ class Plotter:
plt.subplot(3, 3, 3)
for priority in self.results.unprocessed_queue_sizes.keys():
data = {k: max(v) for (k,v) in self.results.unprocessed_queue_sizes[priority].items()}
data = {k: max(v) for (k, v) in self.results.unprocessed_queue_sizes[priority].items()}
Plotter.add_plot(data, time_resolution, priority)
plt.xlabel('Time (s)')
@ -71,9 +72,11 @@ class Plotter:
num = 4
for priority in self.results.latencies.keys():
plt.subplot(3, 3, num)
median_latencies = {k: v[int(0.5*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()}
percentile90_latencies = {k: v[int(0.9*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()}
max_latencies = {k: max(v) if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()}
median_latencies = {k: v[int(0.5 * len(v))] if len(v) > 0 else 0 for (k, v) in
self.results.latencies[priority].items()}
percentile90_latencies = {k: v[int(0.9 * len(v))] if len(v) > 0 else 0 for (k, v) in
self.results.latencies[priority].items()}
max_latencies = {k: max(v) if len(v) > 0 else 0 for (k, v) in self.results.latencies[priority].items()}
Plotter.add_plot(median_latencies, time_resolution, 'median')
Plotter.add_plot(percentile90_latencies, time_resolution, '90th percentile')
@ -94,7 +97,8 @@ class Plotter:
if len(self.results.limit[priority]) > 0:
Plotter.add_plot(self.results.limit[priority], time_resolution, 'Limit', use_avg=True)
if len(self.results.limit_and_budget[priority]) > 0:
Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget', use_avg=True)
Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget',
use_avg=True)
if len(self.results.budget[priority]) > 0:
Plotter.add_plot(self.results.budget[priority], time_resolution, 'Budget', use_avg=True)
@ -104,4 +108,3 @@ class Plotter:
num += 1
plt.show()

View File

@ -20,6 +20,7 @@
import functools
@functools.total_ordering
class Priority:
def __init__(self, priority_value, label):
@ -35,6 +36,7 @@ class Priority:
def __repr__(self):
return repr(self.label)
Priority.SYSTEM = Priority(0, "System")
Priority.DEFAULT = Priority(1, "Default")
Priority.BATCH = Priority(2, "Batch")

View File

@ -25,6 +25,7 @@ import heapq
from priority import Priority
from smoother import Smoother
@functools.total_ordering
class Task:
def __init__(self, time, fxn):
@ -34,6 +35,7 @@ class Task:
def __lt__(self, other):
return self.time < other.time
class Limiter:
class UpdateRateParams:
def __init__(self, time):
@ -79,6 +81,7 @@ class Limiter:
def update_budget(self, params):
pass
class OriginalLimiter(Limiter):
def __init__(self, priority, limit_rate_model, proxy_model):
Limiter.__init__(self, priority, limit_rate_model, proxy_model)
@ -100,6 +103,7 @@ class OriginalLimiter(Limiter):
def update_budget(self, params):
self.limit -= params.num_started
class PositiveBudgetLimiter(OriginalLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model)
@ -108,6 +112,7 @@ class PositiveBudgetLimiter(OriginalLimiter):
self.limit += params.elapsed * self.rate
self.limit = min(self.limit, 2.0 * self.rate)
class ClampedBudgetLimiter(PositiveBudgetLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
@ -117,6 +122,7 @@ class ClampedBudgetLimiter(PositiveBudgetLimiter):
if self.limit > min_budget:
self.limit = max(self.limit - params.num_started, min_budget)
class TimeLimiter(PositiveBudgetLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
@ -126,15 +132,17 @@ class TimeLimiter(PositiveBudgetLimiter):
return params.time >= self.locked_until and PositiveBudgetLimiter.can_start(self, params)
def update_budget(self, params):
#print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch))
# print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch))
if params.min_priority >= self.priority or params.num_started < self.limit:
self.limit -= params.num_started
else:
self.limit = min(self.limit, max(self.limit - params.num_started, -params.last_batch))
self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate)
self.locked_until = min(params.time + 2.0,
max(params.time, self.locked_until) + (params.num_started - self.limit) / self.rate)
# print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority))
#print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority))
class TimePositiveBudgetLimiter(PositiveBudgetLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
@ -149,17 +157,18 @@ class TimePositiveBudgetLimiter(PositiveBudgetLimiter):
return params.num_started + params.count <= self.limit
def update_budget(self, params):
#if params.num_started > 0:
#print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch))
# if params.num_started > 0:
# print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch))
if params.num_started > self.limit:
self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + penalty/self.rate)
self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit) / self.rate)
self.limit = 0
else:
self.limit -= params.num_started
#if params.num_started > 0:
#print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority))
# if params.num_started > 0:
# print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority))
class SmoothingLimiter(OriginalLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
@ -177,7 +186,8 @@ class SmoothingLimiter(OriginalLimiter):
self.smooth_rate_limit.set_total(params.time, self.rate)
def update_limit(self, params):
self.limit = 2.0 * (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))
self.limit = 2.0 * (
self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))
def can_start(self, params):
return params.num_started + params.count <= self.limit
@ -185,15 +195,17 @@ class SmoothingLimiter(OriginalLimiter):
def update_budget(self, params):
self.smooth_released.add_delta(params.time, params.num_started)
class SmoothingBudgetLimiter(SmoothingLimiter):
def __init__(self, priority, limit_rate_model, proxy_model):
SmoothingLimiter.__init__(self, priority, limit_rate_model, proxy_model)
#self.smooth_filled = Smoother(2)
# self.smooth_filled = Smoother(2)
self.budget = 0
def update_limit(self, params):
release_rate = (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))
#self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0)
release_rate = (
self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))
# self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0)
self.limit = 2.0 * release_rate
self.proxy_model.results.rate[self.priority][params.time] = self.smooth_rate_limit.smooth_total(params.time)
@ -202,15 +214,15 @@ class SmoothingBudgetLimiter(SmoothingLimiter):
self.proxy_model.results.limit_and_budget[self.priority][params.time] = self.limit + self.budget
self.proxy_model.results.budget[self.priority][params.time] = self.budget
#self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time))
# self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time))
#if self.smooth_filled.smooth_total(params.time) >= 0.1:
#self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time)
# if self.smooth_filled.smooth_total(params.time) >= 0.1:
# self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time)
#print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget))
# print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget))
def can_start(self, params):
return params.num_started + params.count <= self.limit + self.budget #or params.num_started + params.count <= self.budget
return params.num_started + params.count <= self.limit + self.budget # or params.num_started + params.count <= self.budget
def update_budget(self, params):
self.budget = max(0, self.budget + (self.limit - params.num_started_at_priority) / 2 * params.elapsed)
@ -220,6 +232,7 @@ class SmoothingBudgetLimiter(SmoothingLimiter):
self.smooth_released.add_delta(params.time, params.num_started_at_priority)
class ProxyModel:
class Results:
def __init__(self, priorities, duration):
@ -228,11 +241,11 @@ class ProxyModel:
self.latencies = self.init_result(priorities, [], duration)
self.unprocessed_queue_sizes = self.init_result(priorities, [], duration)
self.rate = {p:{} for p in priorities}
self.released = {p:{} for p in priorities}
self.limit = {p:{} for p in priorities}
self.limit_and_budget = {p:{} for p in priorities}
self.budget = {p:{} for p in priorities}
self.rate = {p: {} for p in priorities}
self.released = {p: {} for p in priorities}
self.limit = {p: {} for p in priorities}
self.limit_and_budget = {p: {} for p in priorities}
self.budget = {p: {} for p in priorities}
def init_result(self, priorities, starting_value, duration):
return {p: {s: copy.copy(starting_value) for s in range(0, duration)} for p in priorities}
@ -241,9 +254,10 @@ class ProxyModel:
self.time = 0
self.log_time = 0
self.duration = duration
self.priority_limiters = { priority: Limiter(priority, ratekeeper_model, self) for priority in workload_model.priorities() }
self.priority_limiters = {priority: Limiter(priority, ratekeeper_model, self) for priority in
workload_model.priorities()}
self.workload_model = workload_model
self.request_scheduled = { p: False for p in self.workload_model.priorities()}
self.request_scheduled = {p: False for p in self.workload_model.priorities()}
self.tasks = []
self.request_queue = []
@ -256,13 +270,14 @@ class ProxyModel:
for priority in self.workload_model.priorities():
next_request = self.workload_model.next_request(self.time, priority)
assert next_request is not None
heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request)))
heapq.heappush(self.tasks, Task(next_request.time,
lambda next_request=next_request: self.receive_request(next_request)))
self.request_scheduled[priority] = True
while True:# or len(self.request_queue) > 0:
while True: # or len(self.request_queue) > 0:
if int(self.time) > self.log_time:
self.log_time = int(self.time)
#print(self.log_time)
# print(self.log_time)
task = heapq.heappop(self.tasks)
self.time = task.time
@ -294,14 +309,15 @@ class ProxyModel:
limiter.update_limit(Limiter.UpdateLimitParams(self.time, elapsed))
current_started = 0
started = {p:0 for p in self.workload_model.priorities()}
started = {p: 0 for p in self.workload_model.priorities()}
min_priority = Priority.SYSTEM
last_batch = 0
while len(self.request_queue) > 0:
request = self.request_queue[0]
if not self.priority_limiters[request.priority].can_start(Limiter.CanStartParams(self.time, current_started, request.count)):
if not self.priority_limiters[request.priority].can_start(
Limiter.CanStartParams(self.time, current_started, request.count)):
break
min_priority = request.priority
@ -310,7 +326,8 @@ class ProxyModel:
if self.workload_model.request_completed(request) and not self.request_scheduled[request.priority]:
next_request = self.workload_model.next_request(self.time, request.priority)
assert next_request is not None
heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request)))
heapq.heappush(self.tasks, Task(next_request.time,
lambda next_request=next_request: self.receive_request(next_request)))
self.request_scheduled[request.priority] = True
current_started += request.count
@ -318,21 +335,23 @@ class ProxyModel:
heapq.heappop(self.request_queue)
self.results.started[request.priority][int(self.time)] += request.count
self.results.latencies[request.priority][int(self.time)].append(self.time-request.time)
self.results.latencies[request.priority][int(self.time)].append(self.time - request.time)
if len(self.request_queue) == 0:
min_priority = Priority.BATCH
for priority, limiter in self.priority_limiters.items():
started_at_priority = sum([v for p,v in started.items() if p <= priority])
limiter.update_budget(Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, len(self.request_queue) == 0 or self.request_queue[0].priority > priority, elapsed))
started_at_priority = sum([v for p, v in started.items() if p <= priority])
limiter.update_budget(
Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch,
len(self.request_queue) == 0 or self.request_queue[0].priority > priority,
elapsed))
for priority in self.workload_model.priorities():
self.results.unprocessed_queue_sizes[priority][int(self.time)].append(self.workload_model.workload_models[priority].outstanding)
self.results.unprocessed_queue_sizes[priority][int(self.time)].append(
self.workload_model.workload_models[priority].outstanding)
current_time = self.time
delay = 0.001
heapq.heappush(self.tasks, Task(self.time + delay, lambda: self.process_requests(current_time)))

View File

@ -20,6 +20,7 @@
import numpy
class RateModel:
def __init__(self):
pass
@ -27,6 +28,7 @@ class RateModel:
def get_rate(self, time):
pass
class FixedRateModel(RateModel):
def __init__(self, rate):
RateModel.__init__(self)
@ -35,10 +37,12 @@ class FixedRateModel(RateModel):
def get_rate(self, time):
return self.rate
class UnlimitedRateModel(FixedRateModel):
def __init__(self):
self.rate = 1e9
class IntervalRateModel(RateModel):
def __init__(self, intervals):
self.intervals = sorted(intervals)
@ -46,16 +50,17 @@ class IntervalRateModel(RateModel):
def get_rate(self, time):
if len(self.intervals) == 0 or time < self.intervals[0][0]:
return 0
target_interval = len(self.intervals)-1
target_interval = len(self.intervals) - 1
for i in range(1, len(self.intervals)):
if time < self.intervals[i][0]:
target_interval = i-1
target_interval = i - 1
break
self.intervals = self.intervals[target_interval:]
return self.intervals[0][1]
class SawtoothRateModel(RateModel):
def __init__(self, low, high, frequency):
self.low = low
@ -63,11 +68,12 @@ class SawtoothRateModel(RateModel):
self.frequency = frequency
def get_rate(self, time):
if int(2*time/self.frequency) % 2 == 0:
if int(2 * time / self.frequency) % 2 == 0:
return self.low
else:
return self.high
class DistributionRateModel(RateModel):
def __init__(self, distribution, frequency):
self.distribution = distribution

View File

@ -22,6 +22,7 @@ import numpy
import rate_model
from priority import Priority
class RatekeeperModel:
def __init__(self, limit_models):
self.limit_models = limit_models
@ -29,39 +30,40 @@ class RatekeeperModel:
def get_limit(self, time, priority):
return self.limit_models[priority].get_rate(time)
predefined_ratekeeper = {}
predefined_ratekeeper['default200_batch100'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(200),
Priority.BATCH: rate_model.FixedRateModel(100)
})
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(200),
Priority.BATCH: rate_model.FixedRateModel(100)
})
predefined_ratekeeper['default_sawtooth'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1),
Priority.BATCH: rate_model.FixedRateModel(0)
})
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1),
Priority.BATCH: rate_model.FixedRateModel(0)
})
predefined_ratekeeper['default_uniform_random'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1),
Priority.BATCH: rate_model.FixedRateModel(0)
})
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1),
Priority.BATCH: rate_model.FixedRateModel(0)
})
predefined_ratekeeper['default_trickle'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(3),
Priority.BATCH: rate_model.FixedRateModel(0)
})
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(3),
Priority.BATCH: rate_model.FixedRateModel(0)
})
predefined_ratekeeper['default1000'] = RatekeeperModel(
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(1000),
Priority.BATCH: rate_model.FixedRateModel(500)
})
{
Priority.SYSTEM: rate_model.UnlimitedRateModel(),
Priority.DEFAULT: rate_model.FixedRateModel(1000),
Priority.BATCH: rate_model.FixedRateModel(500)
})

View File

@ -20,6 +20,7 @@
import math
class Smoother:
def __init__(self, folding_time):
self.folding_time = folding_time
@ -28,10 +29,10 @@ class Smoother:
def reset(self, value):
self.time = 0
self.total = value
self.estimate = value
self.estimate = value
def set_total(self, time, total):
self.add_delta(time, total-self.total)
self.add_delta(time, total - self.total)
def add_delta(self, time, delta):
self.update(time)
@ -43,11 +44,10 @@ class Smoother:
def smooth_rate(self, time):
self.update(time)
return (self.total-self.estimate) / self.folding_time
return (self.total - self.estimate) / self.folding_time
def update(self, time):
elapsed = time - self.time
if elapsed > 0:
self.time = time
self.estimate += (self.total-self.estimate) * (1-math.exp(-elapsed/self.folding_time))
self.estimate += (self.total - self.estimate) * (1 - math.exp(-elapsed / self.folding_time))

View File

@ -25,6 +25,7 @@ import math
import rate_model
from priority import Priority
@functools.total_ordering
class Request:
def __init__(self, time, count, priority):
@ -35,6 +36,7 @@ class Request:
def __lt__(self, other):
return self.priority < other.priority
class PriorityWorkloadModel:
def __init__(self, priority, rate_model, batch_model, generator, max_outstanding=1e9):
self.priority = priority
@ -59,6 +61,7 @@ class PriorityWorkloadModel:
return was_full and self.outstanding < self.max_outstanding
class WorkloadModel:
def __init__(self, workload_models):
self.workload_models = workload_models
@ -72,10 +75,17 @@ class WorkloadModel:
def request_completed(self, request):
return self.workload_models[request.priority].request_completed(request)
class Distribution:
EXPONENTIAL = lambda x: numpy.random.exponential(x)
UNIFORM = lambda x: numpy.random.uniform(0, 2.0*x)
FIXED = lambda x: x
def exponential(x):
return numpy.random.exponential(x)
def uniform(x):
return numpy.random.uniform(0, 2.0 * x)
def fixed(x):
return x
class BatchGenerator:
def __init__(self):
@ -84,6 +94,7 @@ class BatchGenerator:
def next_batch(self):
pass
class DistributionBatchGenerator(BatchGenerator):
def __init__(self, distribution, size):
BatchGenerator.__init__(self)
@ -93,6 +104,7 @@ class DistributionBatchGenerator(BatchGenerator):
def next_batch(self):
return math.ceil(self.distribution(self.size))
class RequestGenerator:
def __init__(self):
pass
@ -100,6 +112,7 @@ class RequestGenerator:
def next_request_interval(self, rate):
pass
class DistributionRequestGenerator(RequestGenerator):
def __init__(self, distribution):
RequestGenerator.__init__(self)
@ -109,93 +122,94 @@ class DistributionRequestGenerator(RequestGenerator):
if rate == 0:
return 1e9
return self.distribution(1.0/rate)
return self.distribution(1.0 / rate)
predefined_workloads = {}
predefined_workloads['slow_exponential'] = WorkloadModel(
{
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.FixedRateModel(100),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.EXPONENTIAL),
max_outstanding=100
)
})
{
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.FixedRateModel(100),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.exponential),
max_outstanding=100
)
})
predefined_workloads['fixed_uniform'] = WorkloadModel(
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.FixedRateModel(95),
DistributionBatchGenerator(Distribution.FIXED, 10),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(1),
DistributionBatchGenerator(Distribution.UNIFORM, 500),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
)
})
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.FixedRateModel(95),
DistributionBatchGenerator(Distribution.fixed, 10),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(1),
DistributionBatchGenerator(Distribution.uniform, 500),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=200
)
})
predefined_workloads['batch_starvation'] = WorkloadModel(
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(1),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.IntervalRateModel([(0,50), (60,150), (120,90)]),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(100),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
)
})
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(1),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.IntervalRateModel([(0, 50), (60, 150), (120, 90)]),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(100),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=200
)
})
predefined_workloads['default_low_high_low'] = WorkloadModel(
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.IntervalRateModel([(0,100), (60,300), (120,100)]),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.UNIFORM),
max_outstanding=200
)
})
{
Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=10
),
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.IntervalRateModel([(0, 100), (60, 300), (120, 100)]),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=200
),
Priority.BATCH: PriorityWorkloadModel(Priority.BATCH,
rate_model.FixedRateModel(0),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.uniform),
max_outstanding=200
)
})
for rate in [83, 100, 180, 190, 200]:
predefined_workloads['default%d' % rate] = WorkloadModel(
{
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.FixedRateModel(rate),
DistributionBatchGenerator(Distribution.FIXED, 1),
DistributionRequestGenerator(Distribution.EXPONENTIAL),
max_outstanding=1000
)
})
{
Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT,
rate_model.FixedRateModel(rate),
DistributionBatchGenerator(Distribution.fixed, 1),
DistributionRequestGenerator(Distribution.exponential),
max_outstanding=1000
)
})

View File

@ -0,0 +1,5 @@
# LeakSanitizer suppressions file for FDB
# https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer
# Not all incoming connections are cleanly shut down in client API tests
leak:ConnectionReaderActorState

View File

@ -24,10 +24,12 @@ import sys
import platform
import os
def error(message):
print(message)
sys.exit(1)
def get_version_string(library_path):
try:
lib = ctypes.cdll.LoadLibrary(library_path)
@ -58,6 +60,7 @@ def get_version_string(library_path):
return version_str
if __name__ == '__main__':
if platform.system() == 'Linux':
default_lib = 'libfdb_c.so'

View File

@ -28,7 +28,6 @@ optional packages:
sortedcontainers (for estimating key range read/write density)
"""
import argparse
from collections import defaultdict
from enum import Enum
@ -55,7 +54,6 @@ supported_protocol_versions = frozenset([PROTOCOL_VERSION_5_2, PROTOCOL_VERSION_
PROTOCOL_VERSION_6_2, PROTOCOL_VERSION_6_3, PROTOCOL_VERSION_7_0,
PROTOCOL_VERSION_7_1, PROTOCOL_VERSION_7_2])
fdb.api_version(520)
BASIC_FORMAT = "%(asctime)s - %(levelname)-8s %(message)s"
@ -188,6 +186,7 @@ class BaseInfo(object):
"""
Corresponds to FdbClientLogEvents::Event
"""
def __init__(self, bb, protocol_version):
# we already read the EventType, so go straight to start_timestamp
self.start_timestamp = bb.get_double()
@ -197,6 +196,7 @@ class BaseInfo(object):
if bb.get_bool():
self.tenant = bb.get_bytes_with_length()
class GetVersionInfo(BaseInfo):
def __init__(self, bb, protocol_version):
super().__init__(bb, protocol_version)
@ -206,6 +206,7 @@ class GetVersionInfo(BaseInfo):
if protocol_version >= PROTOCOL_VERSION_6_3:
self.read_version = bb.get_long()
class GetInfo(BaseInfo):
def __init__(self, bb, protocol_version):
super().__init__(bb, protocol_version)
@ -244,11 +245,11 @@ class CommitInfo(BaseInfo):
self.read_snapshot_version = bb.get_long()
if protocol_version >= PROTOCOL_VERSION_6_3:
self.report_conflicting_keys = bb.get_bool()
if protocol_version >= PROTOCOL_VERSION_7_1:
lock_aware = bb.get_bool()
self.lock_aware = bb.get_bool()
if bb.get_bool():
spanId = bb.get_bytes(16)
self.spanId = bb.get_bytes(16)
class ErrorGetInfo(BaseInfo):
@ -285,9 +286,9 @@ class ErrorCommitInfo(BaseInfo):
self.report_conflicting_keys = bb.get_bool()
if protocol_version >= PROTOCOL_VERSION_7_1:
lock_aware = bb.get_bool()
self.lock_aware = bb.get_bool()
if bb.get_bool():
spanId = bb.get_bytes(16)
self.spanId = bb.get_bytes(16)
class UnsupportedProtocolVersionError(Exception):
@ -314,52 +315,57 @@ class ClientTransactionInfo:
if event == 0:
# we need to read it to consume the buffer even if we don't want to store it
get_version = GetVersionInfo(bb, protocol_version)
if (not type_filter or "get_version" in type_filter):
if not type_filter or "get_version" in type_filter:
self.get_version = get_version
elif event == 1:
get = GetInfo(bb, protocol_version)
if (not type_filter or "get" in type_filter):
if not type_filter or "get" in type_filter:
# because of the crappy json serialization using __dict__ we have to set the list here otherwise
# it doesn't print
if not self.gets: self.gets = []
if not self.gets:
self.gets = []
self.gets.append(get)
elif event == 2:
get_range = GetRangeInfo(bb, protocol_version)
if (not type_filter or "get_range" in type_filter):
if not self.get_ranges: self.get_ranges = []
if not type_filter or "get_range" in type_filter:
if not self.get_ranges:
self.get_ranges = []
self.get_ranges.append(get_range)
elif event == 3:
commit = CommitInfo(bb, protocol_version, full_output=full_output)
if (not type_filter or "commit" in type_filter):
if not type_filter or "commit" in type_filter:
self.commit = commit
elif event == 4:
error_get = ErrorGetInfo(bb, protocol_version)
if (not type_filter or "error_gets" in type_filter):
if not self.error_gets: self.error_gets = []
if not type_filter or "error_gets" in type_filter:
if not self.error_gets:
self.error_gets = []
self.error_gets.append(error_get)
elif event == 5:
error_get_range = ErrorGetRangeInfo(bb, protocol_version)
if (not type_filter or "error_get_range" in type_filter):
if not self.error_get_ranges: self.error_get_ranges = []
if not type_filter or "error_get_range" in type_filter:
if not self.error_get_ranges:
self.error_get_ranges = []
self.error_get_ranges.append(error_get_range)
elif event == 6:
error_commit = ErrorCommitInfo(bb, protocol_version, full_output=full_output)
if (not type_filter or "error_commit" in type_filter):
if not self.error_commits: self.error_commits = []
if not type_filter or "error_commit" in type_filter:
if not self.error_commits:
self.error_commits = []
self.error_commits.append(error_commit)
else:
raise Exception("Unknown event type %d" % event)
def has_types(self):
return self.get_version or self.gets or self.get_ranges or self.commit or self.error_gets \
or self.error_get_ranges or self.error_commits
return self.get_version or self.gets or self.get_ranges or self.commit \
or self.error_gets or self.error_get_ranges or self.error_commits
def to_json(self):
return json.dumps(self, cls=ObjJsonEncoder, sort_keys=True)
class TransactionInfoLoader(object):
max_num_chunks_to_store = 1000 # Each chunk would be 100 KB in size
max_num_chunks_to_store = 1000 # Each chunk would be 100 KB in size
def __init__(self, db, full_output=True, type_filter=None, min_timestamp=None, max_timestamp=None):
self.db = db
@ -433,7 +439,7 @@ class TransactionInfoLoader(object):
reverse = False
for k, v in tr.snapshot.get_range(start_key, end_key, limit=1, reverse=reverse):
return fdb.tuple.unpack(v)[0]
return 0 if start else 0x8000000000000000 # we didn't find any timekeeper data so find the max range
return 0 if start else 0x8000000000000000 # we didn't find any timekeeper data so find the max range
def fetch_transaction_info(self):
if self.min_timestamp:
@ -469,12 +475,12 @@ class TransactionInfoLoader(object):
streaming_mode=fdb.impl.StreamingMode.want_all)
for k, v in transaction_info_range:
found += 1
#logger.debug(k)
# logger.debug(k)
start_key = fdb.KeySelector.first_greater_than(k)
_, tr_id, num_chunks, chunk_num = self.parse_key(k)
#logger.debug("num_chunks=%d, chunk_num=%d" % (num_chunks,chunk_num))
# logger.debug("num_chunks=%d, chunk_num=%d" % (num_chunks,chunk_num))
if num_chunks == 1:
assert chunk_num == 1
@ -482,7 +488,7 @@ class TransactionInfoLoader(object):
info = build_client_transaction_info(v)
if info.has_types():
buffer.append(info)
except UnsupportedProtocolVersionError as e:
except UnsupportedProtocolVersionError:
invalid_transaction_infos += 1
except ValueError:
invalid_transaction_infos += 1
@ -497,7 +503,8 @@ class TransactionInfoLoader(object):
self._check_and_adjust_chunk_cache_size()
else:
if tr_id not in self.tr_info_map:
logger.error("Got a middle chunk without getting beginning part. Discarding transaction id: %s\n" % tr_id)
logger.error(
"Got a middle chunk without getting beginning part. Discarding transaction id: %s\n" % tr_id)
continue
c_list = self.tr_info_map[tr_id]
if c_list[-1].num_chunks != num_chunks or c_list[-1].chunk_num != chunk_num - 1:
@ -513,7 +520,7 @@ class TransactionInfoLoader(object):
info = build_client_transaction_info(b''.join([chunk.value for chunk in c_list]))
if info.has_types():
buffer.append(info)
except UnsupportedProtocolVersionError as e:
except UnsupportedProtocolVersionError:
invalid_transaction_infos += 1
except ValueError:
invalid_transaction_infos += 1
@ -553,6 +560,7 @@ def has_dateparser():
logger.warn("Can't find dateparser so disabling human date parsing")
return False
class ReadCounter(object):
def __init__(self):
from sortedcontainers import SortedDict
@ -560,7 +568,7 @@ class ReadCounter(object):
self.reads[b''] = [0, 0]
self.read_counts = {}
self.hit_count=0
self.hit_count = 0
def process(self, transaction_info):
for get in transaction_info.gets:
@ -576,7 +584,7 @@ class ReadCounter(object):
if end_key is not None:
self.reads.setdefault(end_key, [0, 0])[1] += 1
else:
self.reads.setdefault(start_key+b'\x00', [0, 0])[1] += 1
self.reads.setdefault(start_key + b'\x00', [0, 0])[1] += 1
def get_total_reads(self):
return sum([v for v in self.read_counts.values()])
@ -673,8 +681,8 @@ class ShardFinder(object):
self.shard_cache = {}
def _get_boundary_keys(self, begin, end):
start_pos = max(0, bisect_right(self.boundary_keys, begin)-1)
end_pos = max(0, bisect_right(self.boundary_keys, end)-1)
start_pos = max(0, bisect_right(self.boundary_keys, begin) - 1)
end_pos = max(0, bisect_right(self.boundary_keys, end) - 1)
return self.boundary_keys[start_pos:end_pos]
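
The boundary-key lookup above maps a key to its shard by taking the last boundary that is not greater than the key (bisect_right minus one, clamped at zero). A minimal sketch of the same lookup, written in C++ like the other illustrative snippets added in this document; the boundary keys here are made-up:

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Given sorted shard boundary keys, the shard owning `key` begins at the last
// boundary <= key (bisect_right - 1 in the Python code, upper_bound - 1 here).
// Keys below the first boundary are clamped to the first shard.
std::string shardFor(const std::vector<std::string>& boundaries, const std::string& key) {
    auto it = std::upper_bound(boundaries.begin(), boundaries.end(), key);
    if (it != boundaries.begin()) {
        --it;
    }
    return *it;
}

int main() {
    std::vector<std::string> boundaries = { "", "apple", "mango", "zebra" };
    assert(shardFor(boundaries, "banana") == "apple");
    assert(shardFor(boundaries, "mango") == "mango"); // a boundary starts its own shard
    assert(shardFor(boundaries, "aardvark") == "");   // before "apple" -> first shard
    return 0;
}
```
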
@ -691,9 +699,9 @@ class ShardFinder(object):
return len(self._get_boundary_keys(start_key, end_key)) + 1
def get_addresses_for_key(self, key):
shard = self.boundary_keys[max(0, bisect_right(self.boundary_keys, key)-1)]
shard = self.boundary_keys[max(0, bisect_right(self.boundary_keys, key) - 1)]
do_load = False
if not shard in self.shard_cache:
if shard not in self.shard_cache:
do_load = True
elif self.shard_cache[shard].is_ready():
try:
@ -708,7 +716,7 @@ class ShardFinder(object):
for f in self.outstanding:
try:
f.wait()
except fdb.FDBError as e:
except fdb.FDBError:
pass
self.outstanding = []
@ -726,10 +734,13 @@ class ShardFinder(object):
if item[addr_idx] is not None:
while True:
try:
ranges[index] = item[0:addr_idx] + ([a.decode('ascii') for a in item[addr_idx].wait()],) + item[addr_idx+1:]
ranges[index] = item[0:addr_idx] + ([a.decode('ascii') for a in item[addr_idx].wait()],) \
+ item[addr_idx + 1:]
break
except fdb.FDBError as e:
ranges[index] = item[0:addr_idx] + (self.get_addresses_for_key(item[key_idx]),) + item[addr_idx+1:]
except fdb.FDBError:
ranges[index] = item[0:addr_idx] + (self.get_addresses_for_key(item[key_idx]),) \
+ item[addr_idx + 1:]
class WriteCounter(object):
mutation_types_to_consider = frozenset([MutationType.SET_VALUE, MutationType.ADD_VALUE])
@ -795,10 +806,11 @@ class WriteCounter(object):
filter_addresses = set(filter_addresses)
results = [r for r in results if filter_addresses.issubset(set(r[3]))][0:num]
else:
results = [(key, end, count) for (count, key) in count_pairs[0:num]]
results = [(key, None, count) for (count, key) in count_pairs[0:num]]
return results
def connect(cluster_file=None):
db = fdb.open(cluster_file=cluster_file)
return db
@ -831,22 +843,34 @@ def main():
end_time_group = parser.add_mutually_exclusive_group()
end_time_group.add_argument("--max-timestamp", type=int, help="Don't return events newer than this epoch time")
end_time_group.add_argument("-e", "--end-time", type=str, help="Don't return events older than this parsed time")
parser.add_argument("--num-buckets", type=int, help="The number of buckets to partition the key-space into for operation counts", default=100)
parser.add_argument("--top-requests", type=int, help="If specified will output this many top keys for reads or writes", default=0)
parser.add_argument("--exclude-ports", action="store_true", help="Print addresses without the port number. Only works in versions older than 6.3, and is required in versions older than 6.2.")
parser.add_argument("--single-shard-ranges-only", action="store_true", help="Only print range boundaries that exist in a single shard")
parser.add_argument("-a", "--filter-address", action="append", help="Only print range boundaries that include the given address. This option can used multiple times to include more than one address in the filter, in which case all addresses must match.")
parser.add_argument("--num-buckets", type=int,
help="The number of buckets to partition the key-space into for operation counts", default=100)
parser.add_argument("--top-requests", type=int,
help="If specified will output this many top keys for reads or writes", default=0)
parser.add_argument("--exclude-ports", action="store_true",
help="Print addresses without the port number. Only works in versions older than 6.3, and is required in versions older than 6.2.")
parser.add_argument("--single-shard-ranges-only", action="store_true",
help="Only print range boundaries that exist in a single shard")
parser.add_argument("-a", "--filter-address", action="append",
help="Only print range boundaries that include the given address. This option can used multiple times to include more than one address in the filter, in which case all addresses must match.")
args = parser.parse_args()
type_filter = set()
if args.filter_get_version: type_filter.add("get_version")
if args.filter_get or args.filter_reads: type_filter.add("get")
if args.filter_get_range or args.filter_reads: type_filter.add("get_range")
if args.filter_commit: type_filter.add("commit")
if args.filter_error_get: type_filter.add("error_get")
if args.filter_error_get_range: type_filter.add("error_get_range")
if args.filter_error_commit: type_filter.add("error_commit")
if args.filter_get_version:
type_filter.add("get_version")
if args.filter_get or args.filter_reads:
type_filter.add("get")
if args.filter_get_range or args.filter_reads:
type_filter.add("get_range")
if args.filter_commit:
type_filter.add("commit")
if args.filter_error_get:
type_filter.add("error_get")
if args.filter_error_get_range:
type_filter.add("error_get_range")
if args.filter_error_commit:
type_filter.add("error_commit")
if (not type_filter or "commit" in type_filter):
write_counter = WriteCounter() if args.num_buckets else None
@ -912,7 +936,8 @@ def main():
else:
op_str = 'Key %r' % start
print(" %d. %s\n %d sampled %s (%.2f%%, %.2f%% cumulative)" % (idx+1, op_str, count, context, 100*count/total, 100*running_count/total))
print(" %d. %s\n %d sampled %s (%.2f%%, %.2f%% cumulative)" % (
idx + 1, op_str, count, context, 100 * count / total, 100 * running_count / total))
print(" shard addresses: %s\n" % ", ".join(addresses))
else:
@ -933,10 +958,10 @@ def main():
if not omit:
if omit_start is not None:
if omit_start == idx-1:
if omit_start == idx - 1:
print(" %d. Omitted\n" % (idx))
else:
print(" %d - %d. Omitted\n" % (omit_start+1, idx))
print(" %d - %d. Omitted\n" % (omit_start + 1, idx))
omit_start = None
if total_count is None:
@ -944,18 +969,19 @@ def main():
else:
count_str = '%d sampled %s (%d intersecting)' % (start_count, context, total_count)
if not shard_count:
print(" %d. [%s, %s]\n %d sampled %s\n" % (idx+1, start, end, count, context))
print(" %d. [%s, %s]\n %s\n" % (idx + 1, start, end, count_str))
else:
addresses_string = "; addresses=%s" % ', '.join(addresses) if addresses else ''
print(" %d. [%s, %s]\n %s spanning %d shard(s)%s\n" % (idx+1, start, end, count_str, shard_count, addresses_string))
print(" %d. [%s, %s]\n %s spanning %d shard(s)%s\n" % (
idx + 1, start, end, count_str, shard_count, addresses_string))
elif omit_start is None:
omit_start = idx
if omit_start is not None:
if omit_start == len(range_boundaries)-1:
if omit_start == len(range_boundaries) - 1:
print(" %d. Omitted\n" % len(range_boundaries))
else:
print(" %d - %d. Omitted\n" % (omit_start+1, len(range_boundaries)))
print(" %d - %d. Omitted\n" % (omit_start + 1, len(range_boundaries)))
shard_finder = ShardFinder(db, args.exclude_ports)
@ -963,7 +989,8 @@ def main():
if write_counter:
if args.top_requests:
top_writes = write_counter.get_top_k_writes(args.top_requests, args.filter_address, shard_finder=shard_finder)
top_writes = write_counter.get_top_k_writes(args.top_requests, args.filter_address,
shard_finder=shard_finder)
range_boundaries = write_counter.get_range_boundaries(args.num_buckets, shard_finder=shard_finder)
num_writes = write_counter.get_total_writes()
@ -1014,5 +1041,6 @@ def main():
print("Key-space boundaries with approximately equal read counts:\n")
print_range_boundaries(range_boundaries, "reads")
if __name__ == "__main__":
main()

View File

@ -105,8 +105,8 @@ class RangeCounterTest(unittest.TestCase):
assert rc_count == v, "Counts for %s mismatch. Expected %d got %d" % (k, v, rc_count)
for _ in range(0, 100):
i = random.randint(0, len(letters)-1)
j = random.randint(0, len(letters)-2)
i = random.randint(0, len(letters) - 1)
j = random.randint(0, len(letters) - 2)
if i == j:
j += 1
start_index = min(i, j)
@ -123,4 +123,4 @@ class RangeCounterTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main() # run all tests
unittest.main() # run all tests

View File

@ -321,7 +321,7 @@ and pass the test with ``-f``:
Running a Workload on an actual Cluster
=======================================
Running a workload on a cluster works basically the smae way. However, one must
Running a workload on a cluster works basically the same way. However, one must
actually setup a cluster first. This cluster must run between one and many server
processes with the class test. So above 2-step process becomes a bit more complex:

View File

@ -890,8 +890,18 @@
}
}
},
"tenants":{
"num_tenants":0
"metacluster" : {
"cluster_type" : "management", // management, data, or standalone
"metacluster_name" : "metacluster1",
"metacluster_id" : 12345,
"data_cluster_name" : "data_cluster1", // data cluster only
"data_cluster_id" : 12346, // data cluster only
"num_data_clusters": 10 // management cluster only
},
"tenants" : {
"num_tenants" : 1, // on data cluster, local count; on management cluster, total metacluster count
"num_tenant_groups" : 10,
"tenant_group_capacity" : 20,
}
},
"client":{

View File

@ -36,7 +36,8 @@ ACTOR Future<bool> blobRestoreCommandActor(Database localDb, std::vector<StringR
state bool success = false;
wait(store(success, localDb->blobRestore(normalKeys)));
if (success) {
fmt::print("Started blob restore for the full cluster. Please use 'status' command to check progress.\n");
fmt::print(
"Started blob restore for the full cluster. Please use 'status details' command to check progress.\n");
} else {
fmt::print("Fail to start a new blob restore while there is a pending one.\n");
}

View File

@ -326,7 +326,7 @@ CommandFactory configureFactory(
"count=<TSS_COUNT>|perpetual_storage_wiggle=<WIGGLE_SPEED>|perpetual_storage_wiggle_locality="
"<<LOCALITY_KEY>:<LOCALITY_VALUE>|0>|storage_migration_type={disabled|gradual|aggressive}"
"|tenant_mode={disabled|optional_experimental|required_experimental}|blob_granules_enabled={0|1}"
"|encryption_at_rest_mode={disabled|aes_256_ctr}",
"|encryption_at_rest_mode={disabled|domain_aware|cluster_aware}",
"change the database configuration",
"The `new' option, if present, initializes a new database with the given configuration rather than changing "
"the configuration of an existing one. When used, both a redundancy mode and a storage engine must be "
@ -360,7 +360,8 @@ CommandFactory configureFactory(
"tenant_mode=<disabled|optional_experimental|required_experimental>: Sets the tenant mode for the cluster. If "
"optional, then transactions can be run with or without specifying tenants. If required, all data must be "
"accessed using tenants.\n\n"
"encryption_at_rest_mode=<disabled|aes_256_ctr>: Sets the cluster encryption data at-rest support for the "
"encryption_at_rest_mode=<disabled|domain_aware|cluster_aware>: Sets the cluster encryption data at-rest "
"support for the "
"database. The configuration can be updated ONLY at the time of database creation and once set can't be "
"updated for the lifetime of the database.\n\n"

View File

@ -1125,6 +1125,15 @@ void printStatus(StatusObjectReader statusObj,
outputString += "\n Number of Workers - " + format("%d", numWorkers);
auto numKeyRanges = statusObjBlobGranules["number_of_key_ranges"].get_int();
outputString += "\n Number of Key Ranges - " + format("%d", numKeyRanges);
if (statusObjCluster.has("blob_restore")) {
StatusObjectReader statusObjBlobRestore = statusObjCluster["blob_restore"];
std::string restoreStatus = statusObjBlobRestore["blob_full_restore_phase"].get_str();
if (statusObjBlobRestore.has("blob_full_restore_progress")) {
auto progress = statusObjBlobRestore["blob_full_restore_progress"].get_int();
restoreStatus += " " + format("%d%%", progress);
}
outputString += "\n Full Restore - " + restoreStatus;
}
}
}

View File

@ -294,6 +294,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY, 1.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY = deterministicRandom()->random01() * 60;
init( METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT = 1 + deterministicRandom()->random01() * 59;
init( TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( CLIENT_ENABLE_USING_CLUSTER_ID_KEY, false );
init( ENABLE_ENCRYPTION_CPU_TIME_LOGGING, false );
// clang-format on

View File

@ -206,10 +206,12 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
EncryptionAtRestMode mode;
if (value == "disabled") {
mode = EncryptionAtRestMode::DISABLED;
} else if (value == "aes_256_ctr") {
mode = EncryptionAtRestMode::AES_256_CTR;
} else if (value == "domain_aware") {
mode = EncryptionAtRestMode::DOMAIN_AWARE;
} else if (value == "cluster_aware") {
mode = EncryptionAtRestMode::CLUSTER_AWARE;
} else {
printf("Error: Only disabled|aes_256_ctr are valid for encryption_at_rest_mode.\n");
printf("Error: Only disabled|domain_aware|cluster_aware are valid for encryption_at_rest_mode.\n");
return out;
}
out[p + key] = format("%d", mode);
@ -465,6 +467,168 @@ bool isCompleteConfiguration(std::map<std::string, std::string> const& options)
options.count(p + "storage_engine") == 1;
}
/*
- Validates encryption and tenant mode configurations
- During cluster creation (configure new) we allow the following:
- If encryption mode is disabled/cluster_aware then any tenant mode is allowed
- If the encryption mode is domain_aware then the only allowed tenant mode is required
- During cluster configuration changes the following is allowed:
- Encryption mode cannot be changed (can only be set during creation)
- If the encryption mode is disabled/cluster_aware then any tenant mode changes are allowed
- If the encryption mode is domain_aware then tenant mode changes are not allowed (as the only supported mode is
required)
*/
bool isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration> oldConfiguration,
std::map<std::string, std::string> newConfig,
bool creating) {
EncryptionAtRestMode encryptMode;
TenantMode tenantMode;
if (creating) {
if (newConfig.count(encryptionAtRestModeConfKey.toString()) != 0) {
encryptMode = EncryptionAtRestMode::fromValueRef(
ValueRef(newConfig.find(encryptionAtRestModeConfKey.toString())->second));
// check if the tenant mode is being set during configure new (otherwise assume tenants are disabled)
if (newConfig.count(tenantModeConfKey.toString()) != 0) {
tenantMode = TenantMode::fromValue(ValueRef(newConfig.find(tenantModeConfKey.toString())->second));
}
}
} else {
ASSERT(oldConfiguration.present());
encryptMode = oldConfiguration.get().encryptionAtRestMode;
if (newConfig.count(tenantModeConfKey.toString()) != 0) {
tenantMode = TenantMode::fromValue(ValueRef(newConfig.find(tenantModeConfKey.toString())->second));
} else {
// Tenant mode and encryption mode didn't change
return true;
}
}
TraceEvent(SevDebug, "EncryptAndTenantModes")
.detail("EncryptMode", encryptMode.toString())
.detail("TenantMode", tenantMode.toString());
if (encryptMode.mode == EncryptionAtRestMode::DOMAIN_AWARE && tenantMode != TenantMode::REQUIRED) {
// For domain aware encryption only the required tenant mode is currently supported
TraceEvent(SevWarnAlways, "InvalidEncryptAndTenantConfiguration")
.detail("EncryptMode", encryptMode.toString())
.detail("TenantMode", tenantMode.toString());
return false;
}
return true;
}
bool isTenantModeModeConfigValid(DatabaseConfiguration oldConfiguration, DatabaseConfiguration newConfiguration) {
TenantMode oldTenantMode = oldConfiguration.tenantMode;
TenantMode newTenantMode = newConfiguration.tenantMode;
TraceEvent(SevDebug, "TenantModes")
.detail("OldTenantMode", oldTenantMode.toString())
.detail("NewTenantMode", newTenantMode.toString());
if (oldTenantMode != TenantMode::REQUIRED && newTenantMode == TenantMode::REQUIRED) {
// TODO: Changing from optional/disabled to required tenant mode should be allowed if there is no non-tenant
// data present
TraceEvent(SevWarnAlways, "InvalidTenantConfiguration")
.detail("OldTenantMode", oldTenantMode.toString())
.detail("NewTenantMode", newTenantMode.toString());
return false;
}
return true;
}
TEST_CASE("/ManagementAPI/ChangeConfig/TenantMode") {
DatabaseConfiguration oldConfig;
DatabaseConfiguration newConfig;
std::vector<TenantMode> tenantModes = { TenantMode::DISABLED, TenantMode::OPTIONAL_TENANT, TenantMode::REQUIRED };
// required tenant mode can change to any other tenant mode
oldConfig.tenantMode = TenantMode::REQUIRED;
newConfig.tenantMode = deterministicRandom()->randomChoice(tenantModes);
ASSERT(isTenantModeModeConfigValid(oldConfig, newConfig));
// optional/disabled tenant mode can switch to optional/disabled tenant mode
oldConfig.tenantMode = deterministicRandom()->coinflip() ? TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT;
newConfig.tenantMode = deterministicRandom()->coinflip() ? TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT;
ASSERT(isTenantModeModeConfigValid(oldConfig, newConfig));
// optional/disabled tenant mode CANNOT switch to required tenant mode
oldConfig.tenantMode = deterministicRandom()->coinflip() ? TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT;
newConfig.tenantMode = TenantMode::REQUIRED;
ASSERT(!isTenantModeModeConfigValid(oldConfig, newConfig));
return Void();
}
// unit test for changing encryption/tenant mode config options
TEST_CASE("/ManagementAPI/ChangeConfig/TenantAndEncryptMode") {
std::map<std::string, std::string> newConfig;
std::string encryptModeKey = encryptionAtRestModeConfKey.toString();
std::string tenantModeKey = tenantModeConfKey.toString();
std::vector<TenantMode> tenantModes = { TenantMode::DISABLED, TenantMode::OPTIONAL_TENANT, TenantMode::REQUIRED };
std::vector<EncryptionAtRestMode> encryptionModes = { EncryptionAtRestMode::DISABLED,
EncryptionAtRestMode::CLUSTER_AWARE,
EncryptionAtRestMode::DOMAIN_AWARE };
// configure new test cases
// encryption disabled checks
newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::DISABLED);
newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes));
ASSERT(isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration>(), newConfig, true));
// cluster aware encryption checks
newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::CLUSTER_AWARE);
newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes));
ASSERT(isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration>(), newConfig, true));
// domain aware encryption checks
newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::DOMAIN_AWARE);
newConfig[tenantModeKey] =
std::to_string(deterministicRandom()->coinflip() ? TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT);
ASSERT(!isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration>(), newConfig, true));
newConfig[tenantModeKey] = std::to_string(TenantMode::REQUIRED);
ASSERT(isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration>(), newConfig, true));
// no encrypt mode present
newConfig.erase(encryptModeKey);
newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes));
ASSERT(isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration>(), newConfig, true));
// no tenant mode present
newConfig.erase(tenantModeKey);
newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::DOMAIN_AWARE);
ASSERT(!isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration>(), newConfig, true));
newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::CLUSTER_AWARE);
ASSERT(isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration>(), newConfig, true));
// change config test cases
DatabaseConfiguration oldConfig;
// encryption disabled checks
oldConfig.encryptionAtRestMode = EncryptionAtRestMode::DISABLED;
oldConfig.tenantMode = deterministicRandom()->randomChoice(tenantModes);
newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes));
ASSERT(isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false));
// domain aware encryption checks
oldConfig.encryptionAtRestMode = EncryptionAtRestMode::DOMAIN_AWARE;
oldConfig.tenantMode = TenantMode::REQUIRED;
newConfig[tenantModeKey] =
std::to_string(deterministicRandom()->coinflip() ? TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT);
ASSERT(!isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false));
newConfig[tenantModeKey] = std::to_string(TenantMode::REQUIRED);
ASSERT(isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false));
// cluster aware encryption checks
oldConfig.encryptionAtRestMode = EncryptionAtRestMode::CLUSTER_AWARE;
// required tenant mode can switch to any other tenant mode with cluster aware encryption
oldConfig.tenantMode = deterministicRandom()->randomChoice(tenantModes);
newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes));
ASSERT(isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false));
// no tenant mode present
newConfig.erase(tenantModeKey);
oldConfig.tenantMode = deterministicRandom()->randomChoice(tenantModes);
oldConfig.encryptionAtRestMode = deterministicRandom()->randomChoice(encryptionModes);
ASSERT(isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false));
return Void();
}
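
The rules spelled out in the comment above boil down to one compatibility check between encryption mode and tenant mode: domain-aware encryption requires the required tenant mode, while disabled and cluster-aware encryption accept any tenant mode. A minimal standalone sketch of that rule, using stand-in enums rather than the real FDB types (illustrative only, not the actual implementation):

```cpp
#include <cassert>

// Stand-in enums for illustration only; the real types live in fdbclient/FDBTypes.h.
enum class EncryptMode { Disabled, DomainAware, ClusterAware };
enum class TenantMode { Disabled, Optional, Required };

// Core compatibility rule: domain-aware encryption is only valid when tenants
// are required; every other encryption mode accepts any tenant mode.
bool modesCompatible(EncryptMode e, TenantMode t) {
    if (e == EncryptMode::DomainAware) {
        return t == TenantMode::Required;
    }
    return true;
}

int main() {
    assert(modesCompatible(EncryptMode::Disabled, TenantMode::Optional));
    assert(modesCompatible(EncryptMode::ClusterAware, TenantMode::Disabled));
    assert(modesCompatible(EncryptMode::DomainAware, TenantMode::Required));
    assert(!modesCompatible(EncryptMode::DomainAware, TenantMode::Optional));
    return 0;
}
```
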
ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Transaction* tr) {
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
@ -962,6 +1126,14 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
if (!disableConfigDB) {
wait(verifyConfigurationDatabaseAlive(tr->getDatabase()));
}
if (BUGGIFY_WITH_PROB(0.1)) {
// Introduce a random delay in simulation to allow processes to be
// killed before previousCoordinatorKeys has been reset. This will
// help test scenarios where the previous configuration database
// state has been transferred to the new coordinators but the
// broadcaster thinks it has not been transferred.
wait(delay(deterministicRandom()->random01() * 10));
}
wait(resetPreviousCoordinatorsKey(tr->getDatabase()));
return CoordinatorsResult::SAME_NETWORK_ADDRESSES;
}

View File

@ -1548,17 +1548,19 @@ ThreadFuture<Void> MultiVersionTransaction::onError(Error const& e) {
auto f = tr.transaction ? tr.transaction->onError(e) : makeTimeout<Void>();
f = abortableFuture(f, tr.onChange);
return flatMapThreadFuture<Void, Void>(f, [this, e](ErrorOr<Void> ready) {
if (!ready.isError() || ready.getError().code() != error_code_cluster_version_changed) {
if (ready.isError()) {
return ErrorOr<ThreadFuture<Void>>(ready.getError());
}
return flatMapThreadFuture<Void, Void>(f, [this](ErrorOr<Void> ready) {
if (ready.isError() && ready.getError().code() == error_code_cluster_version_changed) {
// In case of a cluster version change, upgrade (or downgrade) the transaction
// and let it to be retried independently of the original error
updateTransaction();
return ErrorOr<ThreadFuture<Void>>(Void());
}
// In all other cases forward the result of the inner onError call
if (ready.isError()) {
return ErrorOr<ThreadFuture<Void>>(ready.getError());
} else {
return ErrorOr<ThreadFuture<Void>>(Void());
}
updateTransaction();
return ErrorOr<ThreadFuture<Void>>(onError(e));
});
}
}
@ -2968,7 +2970,7 @@ ACTOR Future<std::string> updateClusterSharedStateMapImpl(MultiVersionApi* self,
// The cluster ID will be the connection record string (either a filename or the connection string itself)
// in versions before we could read the cluster ID.
state std::string clusterId = connectionRecord.toString();
if (dbProtocolVersion.hasClusterIdSpecialKey()) {
if (CLIENT_KNOBS->CLIENT_ENABLE_USING_CLUSTER_ID_KEY && dbProtocolVersion.hasClusterIdSpecialKey()) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
try {

View File

@ -7110,11 +7110,11 @@ ACTOR Future<Void> readVersionBatcher(DatabaseContext* cx,
state Reference<Histogram> batchIntervalDist =
Histogram::getHistogram("GrvBatcher"_sr,
"ClientGrvBatchInterval"_sr,
Histogram::Unit::microseconds,
Histogram::Unit::milliseconds,
0,
CLIENT_KNOBS->GRV_BATCH_TIMEOUT * 1000000 * 2);
state Reference<Histogram> grvReplyLatencyDist =
Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvReplyLatency"_sr, Histogram::Unit::microseconds);
Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvReplyLatency"_sr, Histogram::Unit::milliseconds);
state double lastRequestTime = now();
state TransactionTagMap<uint32_t> tags;
@ -10732,12 +10732,13 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
// must be aligned to blob range(s)
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
getBlobRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2);
getBlobRanges(&tr, KeyRangeRef(purgeRange.begin, keyAfter(purgeRange.begin)), 1);
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
getBlobRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2);
getBlobRanges(&tr, KeyRangeRef(purgeRange.end, keyAfter(purgeRange.end)), 1);
wait(success(blobbifiedBegin) && success(blobbifiedEnd));
// If there are no blob ranges on the boundary that's okay as we allow purging of multiple full ranges.
if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
(!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) {
(!blobbifiedEnd.get().empty() && blobbifiedEnd.get().front().begin < purgeRange.end)) {
TraceEvent("UnalignedPurge")
.detail("Range", range)
.detail("Version", purgeVersion)
@ -10941,8 +10942,7 @@ ACTOR Future<bool> blobRestoreActor(Reference<DatabaseContext> cx, KeyRange rang
return false; // stop if there is in-progress restore.
}
}
Standalone<BlobRestoreStatus> status;
status.progress = 0;
BlobRestoreStatus status(BlobRestorePhase::INIT);
Value newValue = blobRestoreCommandValueFor(status);
tr->set(key, newValue);
wait(tr->commit());

View File

@ -218,8 +218,12 @@ class GetGenerationQuorum {
if (self->coordinatorsChangedFuture.isReady()) {
throw coordinators_changed();
}
wait(delayJittered(std::clamp(
0.005 * (1 << std::min(retries, 30)), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND)));
if (deterministicRandom()->random01() < 0.95) {
// Add some random jitter to prevent clients from
// contending.
wait(delayJittered(std::clamp(
0.006 * (1 << std::min(retries, 30)), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND)));
}
if (deterministicRandom()->random01() < 0.05) {
// Randomly inject a delay of at least the generation
// reply timeout, to try to prevent contention between
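
The retry path above clamps an exponential backoff and adds jitter so that contending clients spread out. A small self-contained sketch of that shape; the constants and the jitter range here are illustrative, not the real knob values:

```cpp
#include <algorithm>
#include <cstdio>
#include <random>

// Illustrative clamped exponential backoff with jitter; the real code uses
// FDB's deterministicRandom() and CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND.
double backoffDelay(int retries, double upperBound, std::mt19937& rng) {
    // Exponential growth, with the shift capped so it never overflows.
    double base = 0.006 * (1 << std::min(retries, 30));
    base = std::clamp(base, 0.0, upperBound);
    // Jitter: scale by a random factor so clients desynchronize.
    std::uniform_real_distribution<double> jitter(0.5, 1.5);
    return base * jitter(rng);
}

int main() {
    std::mt19937 rng(42);
    for (int r = 0; r < 8; ++r) {
        std::printf("retry %d -> %.3fs\n", r, backoffDelay(r, 5.0, rng));
    }
    return 0;
}
```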

View File

@ -855,7 +855,8 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema(
"encryption_at_rest_mode": {
"$enum":[
"disabled",
"aes_256_ctr"
"domain_aware",
"cluster_aware"
]}
},
"consistency_scan_info":{
@ -963,11 +964,18 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema(
}
}
},
"tenants":{
"num_tenants":0
},
"metacluster" : {
"cluster_type" : "standalone"
"cluster_type" : "management",
"metacluster_name":"metacluster1",
"metacluster_id":12345,
"data_cluster_name" : "data_cluster1",
"data_cluster_id" : 12346,
"num_data_clusters":10
},
"tenants":{
"num_tenants":0,
"num_tenant_groups":10,
"tenant_group_capacity":20
}
},
"client":{

View File

@ -301,6 +301,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_TRACE_INTERVAL, 300 );
init( CP_FETCH_TENANTS_OVER_STORAGE_QUOTA_INTERVAL, 5 ); if( randomize && BUGGIFY ) CP_FETCH_TENANTS_OVER_STORAGE_QUOTA_INTERVAL = deterministicRandom()->randomInt(1, 10);
// TeamRemover
@ -390,19 +391,22 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// If true, do not process and store RocksDB logs
init( ROCKSDB_MUTE_LOGS, true );
// Use a smaller memtable in simulation to avoid OOMs.
int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024;
int64_t memtableBytes = isSimulated ? 1024 * 1024 : 512 * 1024 * 1024;
init( ROCKSDB_MEMTABLE_BYTES, memtableBytes );
init( ROCKSDB_LEVEL_STYLE_COMPACTION, true );
init( ROCKSDB_UNSAFE_AUTO_FSYNC, false );
init( ROCKSDB_PERIODIC_COMPACTION_SECONDS, 0 );
init( ROCKSDB_PREFIX_LEN, 0 );
// If rocksdb block cache size is 0, the default 8MB is used.
int64_t blockCacheSize = isSimulated ? 0 : 1024 * 1024 * 1024 /* 1GB */;
int64_t blockCacheSize = isSimulated ? 16 * 1024 * 1024 : 1024 * 1024 * 1024 /* 1GB */;
init( ROCKSDB_BLOCK_CACHE_SIZE, blockCacheSize );
init( ROCKSDB_METRICS_DELAY, 60.0 );
init( ROCKSDB_READ_VALUE_TIMEOUT, isSimulated ? 5.0 : 200.0 );
init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, isSimulated ? 5.0 : 200.0 );
init( ROCKSDB_READ_RANGE_TIMEOUT, isSimulated ? 5.0 : 200.0 );
// ROCKSDB_READ_VALUE_TIMEOUT, ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, ROCKSDB_READ_RANGE_TIMEOUT knobs:
// In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have
// very high load and single read thread cannot process all the load within the timeouts.
init( ROCKSDB_READ_VALUE_TIMEOUT, 5.0 ); if (isSimulated) ROCKSDB_READ_VALUE_TIMEOUT = 5 * 60;
init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, 5.0 ); if (isSimulated) ROCKSDB_READ_VALUE_PREFIX_TIMEOUT = 5 * 60;
init( ROCKSDB_READ_RANGE_TIMEOUT, 5.0 ); if (isSimulated) ROCKSDB_READ_RANGE_TIMEOUT = 5 * 60;
init( ROCKSDB_READ_QUEUE_WAIT, 1.0 );
init( ROCKSDB_READ_QUEUE_HARD_MAX, 1000 );
init( ROCKSDB_READ_QUEUE_SOFT_MAX, 500 );
@ -436,6 +440,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
init( ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip();
// ROCKSDB_STATS_LEVEL=1 indicates rocksdb::StatsLevel::kExceptHistogramOrTimers
// Refer StatsLevel: https://github.com/facebook/rocksdb/blob/main/include/rocksdb/statistics.h#L594
init( ROCKSDB_STATS_LEVEL, 1 ); if( randomize && BUGGIFY ) ROCKSDB_STATS_LEVEL = deterministicRandom()->randomInt(0, 6);
// Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded.
@ -555,7 +560,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BACKUP_TIMEOUT, 0.4 );
init( BACKUP_NOOP_POP_DELAY, 5.0 );
init( BACKUP_FILE_BLOCK_BYTES, 1024 * 1024 );
init( BACKUP_LOCK_BYTES, 3e9 ); if(randomize && BUGGIFY) BACKUP_LOCK_BYTES = deterministicRandom()->randomInt(1024, 4096) * 15 * 1024;
init( BACKUP_LOCK_BYTES, 3e9 ); if(randomize && BUGGIFY) BACKUP_LOCK_BYTES = deterministicRandom()->randomInt(1024, 4096) * 30 * 1024;
init( BACKUP_UPLOAD_DELAY, 10.0 ); if(randomize && BUGGIFY) BACKUP_UPLOAD_DELAY = deterministicRandom()->random01() * 60;
//Cluster Controller

View File

@ -876,6 +876,7 @@ const KeyRef triggerDDTeamInfoPrintKey("\xff/triggerDDTeamInfoPrint"_sr);
const KeyRef consistencyScanInfoKey = "\xff/consistencyScanInfo"_sr;
const KeyRef encryptionAtRestModeConfKey("\xff/conf/encryption_at_rest_mode"_sr);
const KeyRef tenantModeConfKey("\xff/conf/tenant_mode"_sr);
const KeyRangeRef excludedServersKeys("\xff/conf/excluded/"_sr, "\xff/conf/excluded0"_sr);
const KeyRef excludedServersPrefix = excludedServersKeys.begin;

View File

@ -355,21 +355,25 @@ Span& Span::operator=(Span&& o) {
g_tracer->trace(*this);
}
arena = std::move(o.arena);
context = o.context;
parentContext = o.parentContext;
begin = o.begin;
end = o.end;
location = o.location;
links = std::move(o.links);
events = std::move(o.events);
status = o.status;
kind = o.kind;
o.context = SpanContext();
o.parentContext = SpanContext();
o.kind = SpanKind::INTERNAL;
o.begin = 0.0;
o.end = 0.0;
o.status = SpanStatus::UNSET;
// All memory referenced in *Ref fields of Span is now (potentially)
// invalid, and o no longer has ownership of any memory referenced by *Ref
// fields of o. We must ensure that o no longer references any memory it no
// longer owns, and that *this no longer references any memory it no longer
// owns. Not every field references arena memory, but this std::exchange
// pattern provides a nice template for getting this right in a concise way
// should we add more fields to Span.
attributes = std::exchange(o.attributes, decltype(o.attributes)());
begin = std::exchange(o.begin, decltype(o.begin)());
context = std::exchange(o.context, decltype(o.context)());
end = std::exchange(o.end, decltype(o.end)());
events = std::exchange(o.events, decltype(o.events)());
kind = std::exchange(o.kind, decltype(o.kind)());
links = std::exchange(o.links, decltype(o.links)());
location = std::exchange(o.location, decltype(o.location)());
parentContext = std::exchange(o.parentContext, decltype(o.parentContext)());
status = std::exchange(o.status, decltype(o.status)());
return *this;
}
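
The comment above describes the std::exchange pattern for move assignment: take each field from the source and reset the source in the same expression, so neither object is left referencing memory it no longer owns. A tiny self-contained illustration of the pattern on an unrelated struct:

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Each field is taken from the source while the source is simultaneously
// reset to a default-constructed value.
struct Record {
    std::string name;
    std::vector<int> values;
    double begin = 0.0;

    Record() = default;
    Record(std::string n, std::vector<int> v, double b)
      : name(std::move(n)), values(std::move(v)), begin(b) {}

    Record& operator=(Record&& o) noexcept {
        name = std::exchange(o.name, decltype(o.name)());
        values = std::exchange(o.values, decltype(o.values)());
        begin = std::exchange(o.begin, decltype(o.begin)());
        return *this;
    }
};

int main() {
    Record a;
    Record b("span", { 1, 2, 3 }, 42.0);
    a = std::move(b);
    assert(a.name == "span" && a.values.size() == 3 && a.begin == 42.0);
    // The moved-from object is in a well-defined, default state.
    assert(b.name.empty() && b.values.empty() && b.begin == 0.0);
    return 0;
}
```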

View File

@ -314,13 +314,19 @@ struct BlobManifest {
};
// Defines blob restore status
enum BlobRestorePhase { INIT = 0, LOAD_MANIFEST = 1, MANIFEST_DONE = 2, MIGRATE = 3, APPLY_MLOGS = 4, DONE = 5 };
struct BlobRestoreStatus {
constexpr static FileIdentifier file_identifier = 378657;
BlobRestorePhase phase;
int progress;
BlobRestoreStatus() : phase(BlobRestorePhase::INIT){};
BlobRestoreStatus(BlobRestorePhase pha) : phase(pha), progress(0){};
BlobRestoreStatus(BlobRestorePhase pha, int prog) : phase(pha), progress(prog){};
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, progress);
serializer(ar, phase, progress);
}
};
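
The phases introduced above form a linear progression from INIT to DONE, and later code only starts migration once the manifest load has finished. A hedged sketch of that progression with a stand-in enum (the names mirror the diff, but this is not the FDB code):

```cpp
#include <cstdio>

// Stand-in for the BlobRestorePhase progression shown above; illustrative only.
enum Phase { INIT = 0, LOAD_MANIFEST, MANIFEST_DONE, MIGRATE, APPLY_MLOGS, DONE };

const char* phaseName(Phase p) {
    switch (p) {
    case INIT: return "INIT";
    case LOAD_MANIFEST: return "LOAD_MANIFEST";
    case MANIFEST_DONE: return "MANIFEST_DONE";
    case MIGRATE: return "MIGRATE";
    case APPLY_MLOGS: return "APPLY_MLOGS";
    case DONE: return "DONE";
    }
    return "UNKNOWN";
}

int main() {
    // Walk the restore forward one phase at a time; migration is only allowed
    // to start once the manifest has been fully loaded (MANIFEST_DONE).
    for (Phase p = INIT; p != DONE; p = static_cast<Phase>(p + 1)) {
        bool canStartMigration = (p == MANIFEST_DONE);
        std::printf("%-14s canStartMigration=%d\n", phaseName(p), canStartMigration);
    }
    return 0;
}
```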

View File

@ -289,6 +289,7 @@ public:
double METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY;
double METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT;
int TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantEntryCache is refreshed
bool CLIENT_ENABLE_USING_CLUSTER_ID_KEY;
// Encryption-at-rest
bool ENABLE_ENCRYPTION_CPU_TIME_LOGGING;

View File

@ -1464,7 +1464,7 @@ struct TenantMode {
struct EncryptionAtRestMode {
// These enumerated values are stored in the database configuration, so can NEVER be changed. Only add new ones
// just before END.
enum Mode { DISABLED = 0, AES_256_CTR = 1, END = 2 };
enum Mode { DISABLED = 0, DOMAIN_AWARE = 1, CLUSTER_AWARE = 2, END = 3 };
EncryptionAtRestMode() : mode(DISABLED) {}
EncryptionAtRestMode(Mode mode) : mode(mode) {
@ -1483,14 +1483,30 @@ struct EncryptionAtRestMode {
switch (mode) {
case DISABLED:
return "disabled";
case AES_256_CTR:
return "aes_256_ctr";
case DOMAIN_AWARE:
return "domain_aware";
case CLUSTER_AWARE:
return "cluster_aware";
default:
ASSERT(false);
}
return "";
}
static EncryptionAtRestMode fromString(std::string mode) {
if (mode == "disabled") {
return EncryptionAtRestMode::DISABLED;
} else if (mode == "cluster_aware") {
return EncryptionAtRestMode::CLUSTER_AWARE;
} else if (mode == "domain_aware") {
return EncryptionAtRestMode::DOMAIN_AWARE;
} else {
TraceEvent(SevError, "UnknownEncryptMode").detail("EncryptMode", mode);
ASSERT(false);
throw internal_error();
}
}
Value toValue() const { return ValueRef(format("%d", (int)mode)); }
bool isEquals(const EncryptionAtRestMode& e) const { return this->mode == e.mode; }
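
Because these enum values are persisted in the database configuration, a decoder has to treat the stored integer as authoritative and reject anything at or beyond END. A minimal sketch of that decode step with stand-in names:

```cpp
#include <cassert>
#include <optional>

// Illustrative only: values must stay stable because they are persisted, so
// new modes are appended just before End and never renumbered.
enum class RestMode { Disabled = 0, DomainAware = 1, ClusterAware = 2, End = 3 };

std::optional<RestMode> decodeRestMode(int stored) {
    if (stored < 0 || stored >= static_cast<int>(RestMode::End)) {
        return std::nullopt; // unknown or future value
    }
    return static_cast<RestMode>(stored);
}

int main() {
    assert(decodeRestMode(1) == RestMode::DomainAware);
    assert(!decodeRestMode(3).has_value()); // End itself is not a valid stored mode
    return 0;
}
```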

View File

@ -133,6 +133,11 @@ bool isCompleteConfiguration(std::map<std::string, std::string> const& options);
ConfigureAutoResult parseConfig(StatusObject const& status);
bool isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration> oldConfiguration,
std::map<std::string, std::string> newConfig,
bool creating);
bool isTenantModeModeConfigValid(DatabaseConfiguration oldConfiguration, DatabaseConfiguration newConfiguration);
// Management API written in template code to support both IClientAPI and NativeAPI
namespace ManagementAPI {
@ -276,6 +281,9 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
if (!isCompleteConfiguration(m)) {
return ConfigurationResult::INCOMPLETE_CONFIGURATION;
}
if (!isEncryptionAtRestModeConfigValid(Optional<DatabaseConfiguration>(), m, creating)) {
return ConfigurationResult::INVALID_CONFIGURATION;
}
} else if (m.count(encryptionAtRestModeConfKey.toString()) != 0) {
// Encryption data at-rest mode can be set only at the time of database creation
return ConfigurationResult::ENCRYPTION_AT_REST_MODE_ALREADY_SET;
@ -322,6 +330,12 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
if (!newConfig.isValid()) {
return ConfigurationResult::INVALID_CONFIGURATION;
}
if (!isEncryptionAtRestModeConfigValid(oldConfig, m, creating)) {
return ConfigurationResult::INVALID_CONFIGURATION;
}
if (!isTenantModeModeConfigValid(oldConfig, newConfig)) {
return ConfigurationResult::INVALID_CONFIGURATION;
}
if (newConfig.tLogPolicy->attributeKeys().count("dcid") && newConfig.regions.size() > 0) {
return ConfigurationResult::REGION_REPLICATION_MISMATCH;

View File

@ -244,6 +244,8 @@ public:
// in the TenantCache
int TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL; // How often the storage quota allocated to each tenant is
// refreshed in the TenantCache
int TENANT_CACHE_STORAGE_USAGE_TRACE_INTERVAL; // The minimum interval between consecutive trace events logging the
// storage bytes used by a tenant group
int CP_FETCH_TENANTS_OVER_STORAGE_QUOTA_INTERVAL; // How often the commit proxies send requests to the data
// distributor to fetch the list of tenants over storage quota
@ -313,7 +315,7 @@ public:
// KeyValueStoreRocksDB
bool ROCKSDB_SET_READ_TIMEOUT;
bool ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES;
int ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE;
bool ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE;
int ROCKSDB_READ_RANGE_ROW_LIMIT;
int ROCKSDB_READER_THREAD_PRIORITY;
int ROCKSDB_WRITER_THREAD_PRIORITY;

View File

@ -284,6 +284,9 @@ extern const KeyRef triggerDDTeamInfoPrintKey;
// Encryption data at-rest config key
extern const KeyRef encryptionAtRestModeConfKey;
// Tenant mode config key
extern const KeyRef tenantModeConfKey;
// The differences between excluded and failed can be found in "command-line-interface.rst"
// and in the help message of the fdbcli command "exclude".

View File

@ -235,7 +235,6 @@ struct TenantNameUniqueSet {
return tenantNames.empty();
}
};
class TenantPrefixIndex : public VersionedMap<Key, TenantNameUniqueSet>, public ReferenceCounted<TenantPrefixIndex> {};
typedef VersionedMap<Key, TenantNameUniqueSet> TenantPrefixIndex;
#endif

View File

@ -243,7 +243,7 @@ ACTOR Future<Void> read_http_response(Reference<HTTP::Response> r, Reference<ICo
auto i = r->headers.find("Content-Length");
if (i != r->headers.end())
r->contentLen = atoi(i->second.c_str());
r->contentLen = strtoll(i->second.c_str(), NULL, 10);
else
r->contentLen = -1; // Content length unknown
@ -481,7 +481,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
}
if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 0) {
printf("[%s] HTTP %scode=%d early=%d, time=%fs %s %s contentLen=%d [%d out, response content len %d]\n",
printf("[%s] HTTP %scode=%d early=%d, time=%fs %s %s contentLen=%d [%d out, response content len %lld]\n",
conn->getDebugID().toString().c_str(),
(err.present() ? format("*ERROR*=%s ", err.get().name()).c_str() : ""),
r->code,
@ -491,7 +491,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
resource.c_str(),
contentLen,
total_sent,
(int)r->contentLen);
r->contentLen);
}
if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 2) {
printf("[%s] HTTP RESPONSE: %s %s\n%s\n",

View File

@ -102,7 +102,7 @@ public:
// If not found, start the read.
if (i == f->m_blocks.end() || (i->second.isValid() && i->second.isError())) {
// printf("starting read of %s block %d\n", f->getFilename().c_str(), blockNum);
fblock = readBlock(f.getPtr(), f->m_block_size, f->m_block_size * blockNum);
fblock = readBlock(f.getPtr(), f->m_block_size, (int64_t)f->m_block_size * blockNum);
f->m_blocks[blockNum] = fblock;
} else
fblock = i->second;
@ -121,7 +121,7 @@ public:
// Calculate the block-relative read range. It's a given that the offset / length range touches this block
// so readStart will never be greater than blocksize (though it could be past the actual end of a short
// block).
int64_t blockStart = blockNum * f->m_block_size;
int64_t blockStart = (int64_t)blockNum * f->m_block_size;
int64_t readStart = std::max<int64_t>(0, offset - blockStart);
int64_t readEnd = std::min<int64_t>(f->m_block_size, offset + length - blockStart);
int rlen = readEnd - readStart;
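
The added (int64_t) cast keeps the blockNum * m_block_size product out of 32-bit arithmetic. A short sketch of the failure mode, with hypothetical block sizes:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical values: 1 MiB blocks, block number 4096 -> byte offset 4 GiB,
    // which does not fit in 32 bits.
    int blockSize = 1 << 20;
    int blockNum = 4096;

    // Widening one operand first forces the whole product into 64-bit
    // arithmetic, which is what the (int64_t) cast in the diff achieves.
    int64_t offset = (int64_t)blockNum * blockSize;

    // For comparison, the same product evaluated in 32 bits (unsigned, to keep
    // the wraparound well-defined) loses the high bits entirely.
    uint32_t wrapped = (uint32_t)blockNum * (uint32_t)blockSize;

    std::printf("64-bit offset: %lld\n", (long long)offset); // 4294967296
    std::printf("32-bit wrap:   %u\n", (unsigned)wrapped);   // 0
    return 0;
}
```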

View File

@ -56,7 +56,7 @@ public:
FailDisk,
RebootAndDelete,
RebootProcessAndDelete,
RebootProcessAndSwitch,
RebootProcessAndSwitch, // Reboot and switch cluster file
Reboot,
RebootProcess,
None

View File

@ -63,7 +63,8 @@ ISimulator::ISimulator()
: desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1), usableRegions(1),
allowLogSetKills(true), tssMode(TSSMode::Disabled), configDBType(ConfigDBType::DISABLED), isStopped(false),
lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false),
backupAgents(BackupAgentType::WaitForType), drAgents(BackupAgentType::WaitForType), allSwapsDisabled(false) {}
backupAgents(BackupAgentType::WaitForType), drAgents(BackupAgentType::WaitForType), allSwapsDisabled(false),
blobGranulesEnabled(false) {}
ISimulator::~ISimulator() = default;
bool simulator_should_inject_fault(const char* context, const char* file, int line, int error_code) {

View File

@ -49,8 +49,8 @@ struct VersionedMessage {
Arena decryptArena; // Arena used for decrypt buffer.
size_t bytes; // arena's size when inserted, which can grow afterwards
VersionedMessage(LogMessageVersion v, StringRef m, const VectorRef<Tag>& t, const Arena& a)
: version(v), message(m), tags(t), arena(a), bytes(a.getSize()) {}
VersionedMessage(LogMessageVersion v, StringRef m, const VectorRef<Tag>& t, const Arena& a, size_t n)
: version(v), message(m), tags(t), arena(a), bytes(n) {}
Version getVersion() const { return version.version; }
uint32_t getSubVersion() const { return version.sub; }
@ -977,15 +977,17 @@ ACTOR Future<Void> pullAsyncData(BackupData* self) {
// Note we aggressively peek (uncommitted) messages, but only committed
// messages/mutations will be flushed to disk/blob in uploadData().
while (r->hasMessage()) {
state size_t takeBytes = 0;
if (!prev.sameArena(r->arena())) {
TraceEvent(SevDebugMemory, "BackupWorkerMemory", self->myId)
.detail("Take", r->arena().getSize())
.detail("Current", self->lock->activePermits());
wait(self->lock->take(TaskPriority::DefaultYield, r->arena().getSize()));
takeBytes = r->arena().getSize(); // more bytes can be allocated after the wait.
wait(self->lock->take(TaskPriority::DefaultYield, takeBytes));
prev = r->arena();
}
self->messages.emplace_back(r->version(), r->getMessage(), r->getTags(), r->arena());
self->messages.emplace_back(r->version(), r->getMessage(), r->getTags(), r->arena(), takeBytes);
r->nextMessage();
}
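
The takeBytes change records the arena size at the moment the permits are taken, because the arena can keep growing while the actor waits and the later release has to match what was actually taken. A minimal sketch of that bookkeeping, with a plain counter standing in for the FlowLock:

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in for a byte-counting lock: take() and release() must be
// called with the same amount or the active count drifts.
struct ByteBudget {
    int64_t active = 0;
    void take(int64_t n) { active += n; }
    void release(int64_t n) { active -= n; }
};

int main() {
    ByteBudget lock;
    int64_t arenaSize = 1000; // size observed when we decided to take permits

    // Record the amount at take time...
    int64_t takeBytes = arenaSize;
    lock.take(takeBytes);

    // ...because the arena can grow afterwards (more messages appended while
    // the actor was waiting). Releasing the *current* size would over-release.
    arenaSize = 2500;

    lock.release(takeBytes); // release exactly what was taken
    assert(lock.active == 0);
    return 0;
}
```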

View File

@ -3547,10 +3547,16 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
bool isFullRestore = wait(isFullRestoreMode(bmData->db, normalKeys));
bmData->isFullRestoreMode = isFullRestore;
if (bmData->isFullRestoreMode) {
BlobRestoreStatus initStatus(BlobRestorePhase::LOAD_MANIFEST);
wait(updateRestoreStatus(bmData->db, normalKeys, initStatus));
wait(loadManifest(bmData->db, bmData->bstore));
int64_t epoc = wait(lastBlobEpoc(bmData->db, bmData->bstore));
wait(updateEpoch(bmData, epoc + 1));
BlobRestoreStatus completedStatus(BlobRestorePhase::MANIFEST_DONE);
wait(updateRestoreStatus(bmData->db, normalKeys, completedStatus));
}
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);

View File

@ -545,7 +545,7 @@ ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef keys) {
KeyRange keyRange = decodeBlobRestoreCommandKeyFor(r.key);
if (keyRange.contains(keys)) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(r.value);
return status.progress < 100; // progress is less than 100
return status.phase < BlobRestorePhase::DONE;
}
}
if (!ranges.more) {
@ -563,3 +563,44 @@ ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef keys) {
}
}
}
// Update restore status
ACTOR Future<Void> updateRestoreStatus(Database db, KeyRangeRef range, BlobRestoreStatus status) {
state Transaction tr(db);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Key key = blobRestoreCommandKeyFor(range);
Value value = blobRestoreCommandValueFor(status);
tr.set(key, value);
wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Get restore status
ACTOR Future<Optional<BlobRestoreStatus>> getRestoreStatus(Database db, KeyRangeRef range) {
state Transaction tr(db);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Key key = blobRestoreCommandKeyFor(range);
Optional<Value> value = wait(tr.get(key));
Optional<BlobRestoreStatus> result;
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
result = status;
}
return result;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbclient/BlobGranuleCommon.h"
#include "flow/ActorCollection.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
@ -75,8 +76,8 @@ private:
// Check if blob manifest is loaded so that blob migration can start
ACTOR static Future<Void> checkIfReadyForMigration(Reference<BlobMigrator> self) {
loop {
bool isFullRestore = wait(isFullRestoreMode(self->db_, normalKeys));
if (isFullRestore) {
Optional<BlobRestoreStatus> status = wait(getRestoreStatus(self->db_, normalKeys));
if (canStartMigration(status)) {
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
if (!granules.empty()) {
self->blobGranules_ = granules;
@ -87,6 +88,9 @@ private:
.detail("Version", granule.version)
.detail("SizeInBytes", granule.sizeInBytes);
}
BlobRestoreStatus status(BlobRestorePhase::MIGRATE, 0);
wait(updateRestoreStatus(self->db_, normalKeys, status));
return Void();
}
}
@ -94,6 +98,15 @@ private:
}
}
// Check if we should start migration. Migration can be started after manifest is fully loaded
static bool canStartMigration(Optional<BlobRestoreStatus> status) {
if (status.present()) {
BlobRestoreStatus value = status.get();
return value.phase == BlobRestorePhase::MANIFEST_DONE; // manifest is loaded successfully
}
return false;
}
// Prepare for data migration for given key range.
ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
// Register as a storage server, so that DataDistributor could start data movement after
@ -120,8 +133,8 @@ private:
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
state Value value = keyServersValue(std::vector<UID>({ serverUID }), std::vector<UID>(), UID(), UID());
wait(krmSetRange(&tr, keyServersPrefix, keys, value));
wait(krmSetRange(&tr, serverKeysPrefixFor(serverUID), keys, serverKeysTrue));
wait(krmSetRangeCoalescing(&tr, keyServersPrefix, keys, allKeys, value));
wait(krmSetRangeCoalescing(&tr, serverKeysPrefixFor(serverUID), keys, allKeys, serverKeysTrue));
wait(tr.commit());
dprint("Assign {} to server {}\n", normalKeys.toString(), serverUID.toString());
return Void();
@ -152,7 +165,7 @@ private:
}
}
if (owning) {
wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
wait(krmSetRangeCoalescing(&tr, serverKeysPrefixFor(id), keys, allKeys, serverKeysFalse));
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
TraceEvent("UnassignKeys").detail("Keys", keys.toString()).detail("From", id.toString());
}
@ -169,8 +182,12 @@ private:
ACTOR static Future<Void> logProgress(Reference<BlobMigrator> self) {
loop {
bool done = wait(checkProgress(self));
if (done)
if (done) {
BlobRestoreStatus status(BlobRestorePhase::DONE);
wait(updateRestoreStatus(self->db_, normalKeys, status));
return Void();
}
wait(delay(SERVER_KNOBS->BLOB_MIGRATOR_CHECK_INTERVAL));
}
}
@ -205,7 +222,8 @@ private:
state bool done = incompleted == 0;
dprint("Migration progress :{}%. done {}\n", progress, done);
TraceEvent("BlobMigratorProgress").detail("Progress", progress).detail("Done", done);
wait(updateProgress(self, normalKeys, progress));
BlobRestoreStatus status(BlobRestorePhase::MIGRATE, progress);
wait(updateRestoreStatus(self->db_, normalKeys, status));
return done;
} catch (Error& e) {
wait(tr.onError(e));
@ -213,32 +231,6 @@ private:
}
}
// Update restore progress
ACTOR static Future<Void> updateProgress(Reference<BlobMigrator> self, KeyRangeRef range, int progress) {
state Transaction tr(self->db_);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state Key key = blobRestoreCommandKeyFor(range);
Optional<Value> value = wait(tr.get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (progress > status.progress) {
status.progress = progress;
Value updatedValue = blobRestoreCommandValueFor(status);
tr.set(key, updatedValue);
wait(tr.commit());
}
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Advance version, so that future commits will have a larger version than the restored data
ACTOR static Future<Void> advanceVersion(Reference<BlobMigrator> self) {
state Transaction tr(self->db_);

View File

@ -26,6 +26,7 @@
#include <tuple>
#include <vector>
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/DatabaseContext.h"
@ -2565,8 +2566,8 @@ ACTOR Future<Void> watchBlobRestoreCommand(ClusterControllerData* self) {
Optional<Value> blobRestoreCommand = wait(tr->get(blobRestoreCommandKey));
if (blobRestoreCommand.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(blobRestoreCommand.get());
TraceEvent("WatchBlobRestoreCommand").detail("Progress", status.progress);
if (status.progress == 0) {
TraceEvent("WatchBlobRestoreCommand").detail("Progress", status.progress).detail("Phase", status.phase);
if (status.phase == BlobRestorePhase::INIT) {
self->db.blobRestoreEnabled.set(true);
if (self->db.blobGranulesEnabled.get()) {
const auto& blobManager = self->db.serverInfo->get().blobManager;

View File

@ -435,7 +435,7 @@ namespace {
EncryptionAtRestMode getEncryptionAtRest() {
// TODO: Use db-config encryption config to determine cluster encryption status
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
return EncryptionAtRestMode(EncryptionAtRestMode::Mode::AES_256_CTR);
return EncryptionAtRestMode(EncryptionAtRestMode::Mode::DOMAIN_AWARE);
} else {
return EncryptionAtRestMode();
}

View File

@ -2910,7 +2910,7 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
ASSERT(commitData.resolvers.size() != 0);
for (int i = 0; i < commitData.resolvers.size(); ++i) {
commitData.stats.resolverDist.push_back(Histogram::getHistogram(
"CommitProxy"_sr, "ToResolver_" + commitData.resolvers[i].id().toString(), Histogram::Unit::microseconds));
"CommitProxy"_sr, "ToResolver_" + commitData.resolvers[i].id().toString(), Histogram::Unit::milliseconds));
}
// Initialize keyResolvers map

View File

@ -92,10 +92,10 @@ class ConfigBroadcasterImpl {
// Used to read a snapshot from the previous coordinators after a change
// coordinators command.
Version maxLastSeenVersion = ::invalidVersion;
Future<Optional<Value>> previousCoordinatorsFuture;
std::unique_ptr<IConfigConsumer> previousCoordinatorsConsumer;
Future<Void> previousCoordinatorsSnapshotFuture;
Version largestConfigNodeVersion{ ::invalidVersion };
UID id;
CounterCollection cc;
@ -106,6 +106,7 @@ class ConfigBroadcasterImpl {
Future<Void> logger;
int coordinators = 0;
std::unordered_set<NetworkAddress> registeredConfigNodes;
std::unordered_set<NetworkAddress> activeConfigNodes;
std::unordered_set<NetworkAddress> registrationResponses;
std::unordered_set<NetworkAddress> registrationResponsesUnregistered;
@ -268,7 +269,7 @@ class ConfigBroadcasterImpl {
// Ask the registering ConfigNode whether it has registered in the past.
state ConfigBroadcastRegisteredReply reply = wait(
brokenPromiseToNever(configBroadcastInterface.registered.getReply(ConfigBroadcastRegisteredRequest{})));
self->maxLastSeenVersion = std::max(self->maxLastSeenVersion, reply.lastSeenVersion);
self->largestConfigNodeVersion = std::max(self->largestConfigNodeVersion, reply.lastSeenVersion);
state bool registered = reply.registered;
TraceEvent("ConfigBroadcasterRegisterNodeReceivedRegistrationReply", self->id)
.detail("Address", address)
@ -302,6 +303,7 @@ class ConfigBroadcasterImpl {
int nodesTillQuorum = self->coordinators / 2 + 1 - (int)self->activeConfigNodes.size();
if (registered) {
self->registeredConfigNodes.insert(address);
self->activeConfigNodes.insert(address);
self->disallowUnregistered = true;
} else if ((self->activeConfigNodes.size() < self->coordinators / 2 + 1 && !self->disallowUnregistered) ||
@ -365,6 +367,52 @@ class ConfigBroadcasterImpl {
state bool sendSnapshot =
self->previousCoordinatorsConsumer && reply.lastSeenVersion <= self->mostRecentVersion;
// If a coordinator change is ongoing, a quorum of ConfigNodes are
// already registered and the largest version at least one of those
// ConfigNodes knows about is greater than the version of the latest
// snapshot the broadcaster has, don't send a snapshot to any
// ConfigNodes. This could end up overwriting committed data. Consider
// the following scenario, with three ConfigNodes:
//
// T=0:
// A: v5
// T=1:
// change coordinators, new coordinators are B, C, D
// T=2:
// B: v5, C: v5, D: v5
// T=3:
// B: v5, C: v10, D: v10
// (some commits happen on only C and D)
// (previousCoordinatorsKey has not been cleared yet)
// T=4:
// D dies and loses its data
// T=5:
// D starts
// B: v5 (registered=yes), C: v10 (registered=yes), D: v0 (registered=no)
// Broadcaster: has an old snapshot, only knows about v5
// self->mostRecentVersion=5
// T=6:
// B, C, D (re-)register with broadcaster
//
// At T=5, the broadcaster would send snapshots to B and D because the
// largest version they know about (5) is less than or equal to
// self->mostRecentVersion (5). But this would cause a majority of
// nodes to think v5 is the latest committed version, causing C to be
// rolled back, and losing commit data between versions 5 and 10.
//
// This is a special case where the coordinators are being changed.
// During a coordinator change, a majority of ConfigNodes being
// registered means the coordinator change already took place, and it
// is being retried due to some failure. In that case, we don't want to
// resend snapshots if a majority of the new ConfigNodes are
// registered, because they could have been accepting commits. Instead,
// let the rollback/rollforward algorithm update the out of date nodes.
if (self->previousCoordinatorsConsumer && self->largestConfigNodeVersion > self->mostRecentVersion &&
self->registeredConfigNodes.size() >= self->coordinators / 2 + 1) {
sendSnapshot = false;
}
// Unregistered nodes need to wait for either:
// 1. A quorum of registered nodes to register and send their
// snapshots, so the unregistered nodes can be rolled forward, or

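The guard added above can be summarized as a small standalone predicate. This is only an illustrative sketch: shouldSendSnapshot, its flattened parameter list, and coordinatorChangeOngoing (standing in for previousCoordinatorsConsumer being set) are not the real ConfigBroadcasterImpl interface.

#include <cstddef>
#include <cstdint>

using Version = int64_t;

bool shouldSendSnapshot(bool coordinatorChangeOngoing,
                        Version largestConfigNodeVersion,
                        Version mostRecentVersion,
                        std::size_t registeredConfigNodes,
                        int coordinators,
                        Version nodeLastSeenVersion) {
    // Default rule: during a coordinator change, a node whose last seen version
    // is not newer than the broadcaster's snapshot gets a snapshot.
    bool sendSnapshot = coordinatorChangeOngoing && nodeLastSeenVersion <= mostRecentVersion;
    // New guard: if a quorum of ConfigNodes has already registered and at least
    // one of them knows a newer version than the broadcaster's snapshot, sending
    // snapshots could roll back committed data, so suppress them and let the
    // rollback/rollforward algorithm reconcile the lagging nodes instead.
    if (coordinatorChangeOngoing && largestConfigNodeVersion > mostRecentVersion &&
        registeredConfigNodes >= static_cast<std::size_t>(coordinators / 2 + 1)) {
        sendSnapshot = false;
    }
    return sendSnapshot;
}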
View File

@ -234,10 +234,13 @@ class ConfigNodeImpl {
req.reply.sendError(process_behind()); // Reuse the process_behind error
return Void();
}
if (BUGGIFY) {
wait(delay(deterministicRandom()->random01() * 2));
}
state Standalone<VectorRef<VersionedConfigMutationRef>> versionedMutations =
wait(getMutations(self, req.lastSeenVersion + 1, committedVersion));
wait(getMutations(self, req.lastSeenVersion + 1, req.mostRecentVersion));
state Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> versionedAnnotations =
wait(getAnnotations(self, req.lastSeenVersion + 1, committedVersion));
wait(getAnnotations(self, req.lastSeenVersion + 1, req.mostRecentVersion));
TraceEvent(SevInfo, "ConfigNodeSendingChanges", self->id)
.detail("ReqLastSeenVersion", req.lastSeenVersion)
.detail("ReqMostRecentVersion", req.mostRecentVersion)
@ -245,7 +248,7 @@ class ConfigNodeImpl {
.detail("NumMutations", versionedMutations.size())
.detail("NumCommits", versionedAnnotations.size());
++self->successfulChangeRequests;
req.reply.send(ConfigFollowerGetChangesReply{ committedVersion, versionedMutations, versionedAnnotations });
req.reply.send(ConfigFollowerGetChangesReply{ versionedMutations, versionedAnnotations });
return Void();
}
@ -520,6 +523,18 @@ class ConfigNodeImpl {
ObjectReader::fromStringRef<KnobValue>(kv.value, IncludeVersion());
}
wait(store(reply.snapshotVersion, getLastCompactedVersion(self)));
if (req.mostRecentVersion < reply.snapshotVersion) {
// The version in the request can be less than the last compacted
// version in certain circumstances where the coordinators are
// being changed and the consumer reads the latest committed
// version from a majority of ConfigNodes before they have received
// up to date snapshots. This should be fine, it just means the
// consumer needs to fetch the latest version and retry its
// request.
CODE_PROBE(true, "ConfigNode ahead of consumer", probe::decoration::rare);
req.reply.sendError(version_already_compacted());
return Void();
}
wait(store(reply.changes, getMutations(self, reply.snapshotVersion + 1, req.mostRecentVersion)));
wait(store(reply.annotations, getAnnotations(self, reply.snapshotVersion + 1, req.mostRecentVersion)));
TraceEvent(SevInfo, "ConfigNodeGettingSnapshot", self->id)

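The snapshot-request guard above can be read as the following decision, shown here as a standalone sketch (SnapshotRequestOutcome and classifySnapshotRequest are illustrative names; the real code replies with the version_already_compacted error):

#include <cstdint>

using Version = int64_t;

enum class SnapshotRequestOutcome { Serve, VersionAlreadyCompacted };

SnapshotRequestOutcome classifySnapshotRequest(Version requestedMostRecentVersion,
                                               Version lastCompactedVersion) {
    if (requestedMostRecentVersion < lastCompactedVersion) {
        // The consumer read a stale committed version (possible while the
        // coordinators are being changed); it should fetch the latest version
        // and retry rather than receive an unservable snapshot.
        return SnapshotRequestOutcome::VersionAlreadyCompacted;
    }
    return SnapshotRequestOutcome::Serve;
}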
View File

@ -1548,14 +1548,20 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
if (enableShardMove && tciIndex == 1) {
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard =
std::pair<Optional<ShardsAffectedByTeamFailure::Team>, bool> remoteTeamWithPhysicalShard =
self->physicalShardCollection->tryGetAvailableRemoteTeamWith(
physicalShardIDCandidate, metrics, debugID);
// TODO: when we know that `physicalShardIDCandidate` exists, remote team must also exist.
if (remoteTeamWithPhysicalShard.present()) {
if (!remoteTeamWithPhysicalShard.second) {
// Physical shard with `physicalShardIDCandidate` is not available. Retry selecting new
// dst physical shard.
self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++;
foundTeams = false;
break;
}
if (remoteTeamWithPhysicalShard.first.present()) {
// Exists a remoteTeam in the mapping that has the physicalShardIDCandidate
// use the remoteTeam with the physicalShard as the bestTeam
req = GetTeamRequest(remoteTeamWithPhysicalShard.get().servers);
req = GetTeamRequest(remoteTeamWithPhysicalShard.first.get().servers);
}
}
@ -1853,19 +1859,35 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
state Error error = success();
state Promise<Void> dataMovementComplete;
// Move keys from source to destination by changing the serverKeyList and keyServerList system keys
state Future<Void> doMoveKeys =
self->txnProcessor->moveKeys(MoveKeysParams{ rd.dataMoveId,
rd.keys,
destIds,
healthyIds,
self->lock,
dataMovementComplete,
&self->startMoveKeysParallelismLock,
&self->finishMoveKeysParallelismLock,
self->teamCollections.size() > 1,
relocateShardInterval.pairID,
ddEnabledState,
CancelConflictingDataMoves::False });
std::unique_ptr<MoveKeysParams> params;
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
params = std::make_unique<MoveKeysParams>(rd.dataMoveId,
std::vector<KeyRange>{ rd.keys },
destIds,
healthyIds,
self->lock,
dataMovementComplete,
&self->startMoveKeysParallelismLock,
&self->finishMoveKeysParallelismLock,
self->teamCollections.size() > 1,
relocateShardInterval.pairID,
ddEnabledState,
CancelConflictingDataMoves::False);
} else {
params = std::make_unique<MoveKeysParams>(rd.dataMoveId,
rd.keys,
destIds,
healthyIds,
self->lock,
dataMovementComplete,
&self->startMoveKeysParallelismLock,
&self->finishMoveKeysParallelismLock,
self->teamCollections.size() > 1,
relocateShardInterval.pairID,
ddEnabledState,
CancelConflictingDataMoves::False);
}
state Future<Void> doMoveKeys = self->txnProcessor->moveKeys(*params);
state Future<Void> pollHealth =
signalledTransferComplete ? Never()
: delay(SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch);
@ -1878,19 +1900,35 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
healthyIds.insert(healthyIds.end(), extraIds.begin(), extraIds.end());
extraIds.clear();
ASSERT(totalIds == destIds.size()); // Sanity check the destIDs before we move keys
doMoveKeys =
self->txnProcessor->moveKeys(MoveKeysParams{ rd.dataMoveId,
rd.keys,
destIds,
healthyIds,
self->lock,
Promise<Void>(),
&self->startMoveKeysParallelismLock,
&self->finishMoveKeysParallelismLock,
self->teamCollections.size() > 1,
relocateShardInterval.pairID,
ddEnabledState,
CancelConflictingDataMoves::False });
std::unique_ptr<MoveKeysParams> params;
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
params = std::make_unique<MoveKeysParams>(rd.dataMoveId,
std::vector<KeyRange>{ rd.keys },
destIds,
healthyIds,
self->lock,
Promise<Void>(),
&self->startMoveKeysParallelismLock,
&self->finishMoveKeysParallelismLock,
self->teamCollections.size() > 1,
relocateShardInterval.pairID,
ddEnabledState,
CancelConflictingDataMoves::False);
} else {
params = std::make_unique<MoveKeysParams>(rd.dataMoveId,
rd.keys,
destIds,
healthyIds,
self->lock,
Promise<Void>(),
&self->startMoveKeysParallelismLock,
&self->finishMoveKeysParallelismLock,
self->teamCollections.size() > 1,
relocateShardInterval.pairID,
ddEnabledState,
CancelConflictingDataMoves::False);
}
doMoveKeys = self->txnProcessor->moveKeys(*params);
} else {
self->fetchKeysComplete.insert(rd);
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {

View File

@ -1756,7 +1756,7 @@ InOverSizePhysicalShard PhysicalShardCollection::isInOverSizePhysicalShard(KeyRa
}
// May return a problematic remote team
Optional<ShardsAffectedByTeamFailure::Team> PhysicalShardCollection::tryGetAvailableRemoteTeamWith(
std::pair<Optional<ShardsAffectedByTeamFailure::Team>, bool> PhysicalShardCollection::tryGetAvailableRemoteTeamWith(
uint64_t inputPhysicalShardID,
StorageMetrics const& moveInMetrics,
uint64_t debugID) {
@ -1764,10 +1764,10 @@ Optional<ShardsAffectedByTeamFailure::Team> PhysicalShardCollection::tryGetAvail
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
ASSERT(inputPhysicalShardID != anonymousShardId.first() && inputPhysicalShardID != UID().first());
if (physicalShardInstances.count(inputPhysicalShardID) == 0) {
return Optional<ShardsAffectedByTeamFailure::Team>();
return { Optional<ShardsAffectedByTeamFailure::Team>(), true };
}
if (!checkPhysicalShardAvailable(inputPhysicalShardID, moveInMetrics)) {
return Optional<ShardsAffectedByTeamFailure::Team>();
return { Optional<ShardsAffectedByTeamFailure::Team>(), false };
}
for (auto team : physicalShardInstances[inputPhysicalShardID].teams) {
if (team.primary == false) {
@ -1777,10 +1777,12 @@ Optional<ShardsAffectedByTeamFailure::Team> PhysicalShardCollection::tryGetAvail
.detail("TeamSize", team.servers.size())
.detail("PhysicalShardsOfTeam", convertIDsToString(teamPhysicalShardIDs[team]))
.detail("DebugID", debugID);*/
return team;
return { team, true };
}
}
UNREACHABLE();
// In this case, the physical shard may not be populated in the remote region yet, e.g., we are making a
// configuration change to turn a single region cluster into HA mode.
return { Optional<ShardsAffectedByTeamFailure::Team>(), true };
}
// The update of PhysicalShardToTeams, Collection, keyRangePhysicalShardIDMap should be atomic

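The return type change above packs two facts into a pair: whether a remote team holding the physical shard was found, and whether the physical shard can accept the move at all. A simplified sketch of how a caller such as dataDistributionRelocator can interpret it (Team, RemoteTeamDecision and interpret are illustrative stand-ins):

#include <optional>
#include <utility>

struct Team {};

enum class RemoteTeamDecision { ReusePhysicalShardTeam, PickBestTeam, RetryNewPhysicalShard };

RemoteTeamDecision interpret(const std::pair<std::optional<Team>, bool>& result) {
    if (!result.second) {
        // The physical shard exists but cannot absorb the move: the caller
        // should retry with a different destination physical shard.
        return RemoteTeamDecision::RetryNewPhysicalShard;
    }
    if (result.first.has_value()) {
        // A remote team already hosts this physical shard: reuse it as the
        // best team for the relocation.
        return RemoteTeamDecision::ReusePhysicalShardTeam;
    }
    // Shard unknown or not yet populated in the remote region (e.g. a
    // single-region cluster being reconfigured into HA mode): fall back to
    // normal team selection.
    return RemoteTeamDecision::PickBestTeam;
}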
View File

@ -723,6 +723,17 @@ struct DDMockTxnProcessorImpl {
return Void();
}
static Future<Void> rawCheckFetchingState(DDMockTxnProcessor* self, const MoveKeysParams& params) {
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
ASSERT(params.ranges.present());
// TODO: make startMoveShards work with multiple ranges.
ASSERT(params.ranges.get().size() == 1);
return checkFetchingState(self, params.destinationTeam, params.ranges.get().at(0));
}
ASSERT(params.keys.present());
return checkFetchingState(self, params.destinationTeam, params.keys.get());
}
ACTOR static Future<Void> moveKeys(DDMockTxnProcessor* self, MoveKeysParams params) {
state std::map<UID, StorageServerInterface> tssMapping;
// Because SFBTF::Team requires the ID is ordered
@ -732,7 +743,7 @@ struct DDMockTxnProcessorImpl {
wait(self->rawStartMovement(params, tssMapping));
ASSERT(tssMapping.empty());
wait(checkFetchingState(self, params.destinationTeam, params.keys));
wait(rawCheckFetchingState(self, params));
wait(self->rawFinishMovement(params, tssMapping));
if (!params.dataMovementComplete.isSet())
@ -915,6 +926,16 @@ Future<std::vector<ProcessData>> DDMockTxnProcessor::getWorkers() const {
ACTOR Future<Void> rawStartMovement(std::shared_ptr<MockGlobalState> mgs,
MoveKeysParams params,
std::map<UID, StorageServerInterface> tssMapping) {
state KeyRange keys;
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
ASSERT(params.ranges.present());
// TODO: make startMoveShards work with multiple ranges.
ASSERT(params.ranges.get().size() == 1);
keys = params.ranges.get().at(0);
} else {
ASSERT(params.keys.present());
keys = params.keys.get();
}
// There won't be parallel rawStart or rawFinish in the mock world because the following *mock* transaction code
// will always finish without a coroutine switch.
ASSERT(params.startMoveKeysParallelismLock->activePermits() == 0);
@ -925,15 +946,15 @@ ACTOR Future<Void> rawStartMovement(std::shared_ptr<MockGlobalState> mgs,
destTeams.emplace_back(params.destinationTeam, true);
// invariant: the splitting and merge operation won't happen at the same moveKeys action. For example, if [a,c) [c,
// e) exists, the params.keys won't be [b, d).
auto intersectRanges = mgs->shardMapping->intersectingRanges(params.keys);
auto intersectRanges = mgs->shardMapping->intersectingRanges(keys);
// 1. splitting or just move a range. The new boundary need to be defined in startMovement
if (intersectRanges.begin().range().contains(params.keys)) {
mgs->shardMapping->defineShard(params.keys);
if (intersectRanges.begin().range().contains(keys)) {
mgs->shardMapping->defineShard(keys);
}
// 2. merge ops will coalesce the boundary in finishMovement;
intersectRanges = mgs->shardMapping->intersectingRanges(params.keys);
ASSERT(params.keys.begin == intersectRanges.begin().begin());
ASSERT(params.keys.end == intersectRanges.end().begin());
intersectRanges = mgs->shardMapping->intersectingRanges(keys);
ASSERT(keys.begin == intersectRanges.begin().begin());
ASSERT(keys.end == intersectRanges.end().begin());
for (auto it = intersectRanges.begin(); it != intersectRanges.end(); ++it) {
auto teamPair = mgs->shardMapping->getTeamsFor(it->begin());
@ -945,8 +966,8 @@ ACTOR Future<Void> rawStartMovement(std::shared_ptr<MockGlobalState> mgs,
deterministicRandom()->randomInt64(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
for (auto& id : params.destinationTeam) {
auto& server = mgs->allServers.at(id);
server.setShardStatus(params.keys, MockShardStatus::INFLIGHT, mgs->restrictSize);
server.signalFetchKeys(params.keys, randomRangeSize);
server.setShardStatus(keys, MockShardStatus::INFLIGHT, mgs->restrictSize);
server.signalFetchKeys(keys, randomRangeSize);
}
return Void();
}
@ -959,6 +980,17 @@ Future<Void> DDMockTxnProcessor::rawStartMovement(const MoveKeysParams& params,
ACTOR Future<Void> rawFinishMovement(std::shared_ptr<MockGlobalState> mgs,
MoveKeysParams params,
std::map<UID, StorageServerInterface> tssMapping) {
state KeyRange keys;
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
ASSERT(params.ranges.present());
// TODO: make startMoveShards work with multiple ranges.
ASSERT(params.ranges.get().size() == 1);
keys = params.ranges.get().at(0);
} else {
ASSERT(params.keys.present());
keys = params.keys.get();
}
// There won't be parallel rawStart or rawFinish in the mock world because the following *mock* transaction code
// will always finish without a coroutine switch.
ASSERT(params.finishMoveKeysParallelismLock->activePermits() == 0);
@ -966,7 +998,7 @@ ACTOR Future<Void> rawFinishMovement(std::shared_ptr<MockGlobalState> mgs,
state FlowLock::Releaser releaser(*params.finishMoveKeysParallelismLock);
// get source and dest teams
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys);
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(keys);
ASSERT_EQ(destTeams.size(), 1); // Will the multi-region or dynamic replica make destTeam.size() > 1?
if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) {
@ -978,7 +1010,7 @@ ACTOR Future<Void> rawFinishMovement(std::shared_ptr<MockGlobalState> mgs,
}
for (auto& id : params.destinationTeam) {
mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::COMPLETED, mgs->restrictSize);
mgs->allServers.at(id).setShardStatus(keys, MockShardStatus::COMPLETED, mgs->restrictSize);
}
// remove destination servers from source servers
@ -986,11 +1018,11 @@ ACTOR Future<Void> rawFinishMovement(std::shared_ptr<MockGlobalState> mgs,
for (auto& id : srcTeams.front().servers) {
// the only caller moveKeys will always make sure the UID are sorted
if (!std::binary_search(params.destinationTeam.begin(), params.destinationTeam.end(), id)) {
mgs->allServers.at(id).removeShard(params.keys);
mgs->allServers.at(id).removeShard(keys);
}
}
mgs->shardMapping->finishMove(params.keys);
mgs->shardMapping->defineShard(params.keys); // coalesce for merge
mgs->shardMapping->finishMove(keys);
mgs->shardMapping->defineShard(keys); // coalesce for merge
return Void();
}

View File

@ -134,9 +134,9 @@ struct GrvProxyStats {
recentRequests(0), lastBucketBegin(now()),
bucketInterval(FLOW_KNOBS->BASIC_LOAD_BALANCE_UPDATE_RATE / FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS),
grvConfirmEpochLiveDist(
Histogram::getHistogram("GrvProxy"_sr, "GrvConfirmEpochLive"_sr, Histogram::Unit::microseconds)),
Histogram::getHistogram("GrvProxy"_sr, "GrvConfirmEpochLive"_sr, Histogram::Unit::milliseconds)),
grvGetCommittedVersionRpcDist(
Histogram::getHistogram("GrvProxy"_sr, "GrvGetCommittedVersionRpc"_sr, Histogram::Unit::microseconds)) {
Histogram::getHistogram("GrvProxy"_sr, "GrvGetCommittedVersionRpc"_sr, Histogram::Unit::milliseconds)) {
// The rate at which the limit(budget) is allowed to grow.
specialCounter(cc, "SystemGRVQueueSize", [this]() { return this->systemGRVQueueSize; });
specialCounter(cc, "DefaultGRVQueueSize", [this]() { return this->defaultGRVQueueSize; });

View File

@ -68,12 +68,9 @@
#ifdef SSD_ROCKSDB_EXPERIMENTAL
// Enforcing rocksdb version to be 6.27.3 or greater.
static_assert(ROCKSDB_MAJOR >= 6, "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
static_assert(ROCKSDB_MAJOR == 6 ? ROCKSDB_MINOR >= 27 : true,
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 : true,
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
// Enforcing rocksdb version to be 7.7.3.
static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3),
"Unsupported rocksdb version. Update the rocksdb to 7.7.3 version");
namespace {
using rocksdb::BackgroundErrorReason;
@ -901,6 +898,7 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
};
// To control the rocksdb::StatsLevel, use ROCKSDB_STATS_LEVEL knob.
// Refer to StatsLevel: https://github.com/facebook/rocksdb/blob/main/include/rocksdb/statistics.h#L594
state std::vector<std::pair<const char*, uint32_t>> histogramStats = {
{ "CompactionTime", rocksdb::COMPACTION_TIME }, // enabled if rocksdb::StatsLevel > kExceptTimers(2)
{ "CompactionCPUTime", rocksdb::COMPACTION_CPU_TIME }, // enabled if rocksdb::StatsLevel > kExceptTimers(2)
@ -970,6 +968,7 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
}
// None of the histogramStats are enabled unless the ROCKSDB_STATS_LEVEL > kExceptHistogramOrTimers(1)
// Refer to StatsLevel: https://github.com/facebook/rocksdb/blob/main/include/rocksdb/statistics.h#L594
if (SERVER_KNOBS->ROCKSDB_STATS_LEVEL > rocksdb::kExceptHistogramOrTimers) {
for (auto& [name, histogram] : histogramStats) {
rocksdb::HistogramData histogram_data;
@ -1031,7 +1030,10 @@ void logRocksDBError(UID id,
Optional<Severity> sev = Optional<Severity>()) {
Severity level = sev.present() ? sev.get() : (status.IsTimedOut() ? SevWarn : SevError);
TraceEvent e(level, "RocksDBError", id);
e.detail("Error", status.ToString()).detail("Method", method).detail("RocksDBSeverity", status.severity());
e.setMaxFieldLength(10000)
.detail("Error", status.ToString())
.detail("Method", method)
.detail("RocksDBSeverity", status.severity());
if (status.IsIOError()) {
e.detail("SubCode", status.subcode());
}
@ -1253,15 +1255,18 @@ struct RocksDBKeyValueStore : IKeyValueStore {
std::make_pair(ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM.toString(), commitBeginTime - a.startTime));
}
Standalone<VectorRef<KeyRangeRef>> deletes;
DeleteVisitor dv(deletes, deletes.arena());
rocksdb::Status s = a.batchToCommit->Iterate(&dv);
if (!s.ok()) {
logRocksDBError(id, s, "CommitDeleteVisitor");
a.done.sendError(statusToError(s));
return;
if (SERVER_KNOBS->ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE) {
DeleteVisitor dv(deletes, deletes.arena());
rocksdb::Status s = a.batchToCommit->Iterate(&dv);
if (!s.ok()) {
logRocksDBError(id, s, "CommitDeleteVisitor");
a.done.sendError(statusToError(s));
return;
}
// If there are any range deletes, we should have added them to be deleted.
ASSERT(!deletes.empty() || !a.batchToCommit->HasDeleteRange());
}
// If there are any range deletes, we should have added them to be deleted.
ASSERT(!deletes.empty() || !a.batchToCommit->HasDeleteRange());
rocksdb::WriteOptions options;
options.sync = !SERVER_KNOBS->ROCKSDB_UNSAFE_AUTO_FSYNC;
if (SERVER_KNOBS->ROCKSDB_DISABLE_WAL_EXPERIMENTAL) {
@ -1275,7 +1280,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
// Request for batchToCommit bytes. If this request cannot be satisfied, the call is blocked.
rateLimiter->Request(a.batchToCommit->GetDataSize() /* bytes */, rocksdb::Env::IO_HIGH);
}
s = db->Write(options, a.batchToCommit.get());
rocksdb::Status s = db->Write(options, a.batchToCommit.get());
readIterPool->update();
double currTime = timer_monotonic();
sharedState->dbWriteLatency.addMeasurement(currTime - writeBeginTime);
@ -1402,17 +1407,11 @@ struct RocksDBKeyValueStore : IKeyValueStore {
ThreadReturnPromiseStream<std::pair<std::string, double>>* metricPromiseStream)
: id(id), db(db), cf(cf), sharedState(sharedState), readIterPool(readIterPool),
perfContextMetrics(perfContextMetrics), metricPromiseStream(metricPromiseStream), threadIndex(threadIndex) {
if (g_network->isSimulated()) {
// In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have
// very high load and single read thread cannot process all the load within the timeouts.
readValueTimeout = 5 * 60;
readValuePrefixTimeout = 5 * 60;
readRangeTimeout = 5 * 60;
} else {
readValueTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_TIMEOUT;
readValuePrefixTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_PREFIX_TIMEOUT;
readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT;
}
readValueTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_TIMEOUT;
readValuePrefixTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_PREFIX_TIMEOUT;
readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT;
if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) {
// Enable perf context on the same thread with the db thread
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
@ -1792,39 +1791,39 @@ struct RocksDBKeyValueStore : IKeyValueStore {
ACTOR Future<Void> updateHistogram(FutureStream<std::pair<std::string, double>> metricFutureStream) {
state Reference<Histogram> commitLatencyHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_LATENCY_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> commitActionHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_ACTION_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_ACTION_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> commitQueueWaitHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> writeHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_WRITE_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_WRITE_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> deleteCompactRangeHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readRangeLatencyHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readValueLatencyHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readPrefixLatencyHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_LATENCY_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readRangeActionHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_ACTION_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_ACTION_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readValueActionHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_ACTION_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_ACTION_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readPrefixActionHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_ACTION_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_ACTION_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readRangeQueueWaitHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readValueQueueWaitHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readPrefixQueueWaitHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readRangeNewIteratorHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readValueGetHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_GET_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_GET_HISTOGRAM, Histogram::Unit::milliseconds);
state Reference<Histogram> readPrefixGetHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_GET_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_GET_HISTOGRAM, Histogram::Unit::milliseconds);
loop {
choose {
when(std::pair<std::string, double> measure = waitNext(metricFutureStream)) {

View File

@ -41,12 +41,9 @@
#ifdef SSD_ROCKSDB_EXPERIMENTAL
// Enforcing rocksdb version to be 6.27.3 or greater.
static_assert(ROCKSDB_MAJOR >= 6, "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
static_assert(ROCKSDB_MAJOR == 6 ? ROCKSDB_MINOR >= 27 : true,
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 : true,
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
// Enforcing rocksdb version to be 7.7.3.
static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3),
"Unsupported rocksdb version. Update the rocksdb to 7.7.3 version");
const std::string rocksDataFolderSuffix = "-data";
const std::string METADATA_SHARD_ID = "kvs-metadata";
@ -170,7 +167,10 @@ std::string getShardMappingKey(KeyRef key, StringRef prefix) {
void logRocksDBError(const rocksdb::Status& status, const std::string& method) {
auto level = status.IsTimedOut() ? SevWarn : SevError;
TraceEvent e(level, "ShardedRocksDBError");
e.detail("Error", status.ToString()).detail("Method", method).detail("ShardedRocksDBSeverity", status.severity());
e.setMaxFieldLength(10000)
.detail("Error", status.ToString())
.detail("Method", method)
.detail("ShardedRocksDBSeverity", status.severity());
if (status.IsIOError()) {
e.detail("SubCode", status.subcode());
}
@ -449,7 +449,8 @@ struct DataShard {
// PhysicalShard represents a collection of logical shards. A PhysicalShard could have one or more DataShards. A
// PhysicalShard is stored as a column family in rocksdb. Each PhysicalShard has its own iterator pool.
struct PhysicalShard {
PhysicalShard(rocksdb::DB* db, std::string id) : db(db), id(id), isInitialized(false) {}
PhysicalShard(rocksdb::DB* db, std::string id, const rocksdb::ColumnFamilyOptions& options)
: db(db), id(id), cfOptions(options), isInitialized(false) {}
PhysicalShard(rocksdb::DB* db, std::string id, rocksdb::ColumnFamilyHandle* handle)
: db(db), id(id), cf(handle), isInitialized(true) {
ASSERT(cf);
@ -460,7 +461,7 @@ struct PhysicalShard {
if (cf) {
return rocksdb::Status::OK();
}
auto status = db->CreateColumnFamily(getCFOptions(), id, &cf);
auto status = db->CreateColumnFamily(cfOptions, id, &cf);
if (!status.ok()) {
logRocksDBError(status, "AddCF");
return status;
@ -516,6 +517,7 @@ struct PhysicalShard {
rocksdb::DB* db;
std::string id;
rocksdb::ColumnFamilyOptions cfOptions;
rocksdb::ColumnFamilyHandle* cf = nullptr;
std::unordered_map<std::string, std::unique_ptr<DataShard>> dataShards;
std::shared_ptr<ReadIteratorPool> readIterPool;
@ -586,7 +588,8 @@ int readRangeInDb(PhysicalShard* shard, const KeyRangeRef range, int rowLimit, i
// Manages physical shards and maintains logical shard mapping.
class ShardManager {
public:
ShardManager(std::string path, UID logId) : path(path), logId(logId), dataShardMap(nullptr, specialKeys.end) {}
ShardManager(std::string path, UID logId, const rocksdb::Options& options)
: path(path), logId(logId), dbOptions(options), dataShardMap(nullptr, specialKeys.end) {}
ACTOR static Future<Void> shardMetricsLogger(std::shared_ptr<ShardedRocksDBState> rState,
Future<Void> openFuture,
@ -637,31 +640,31 @@ public:
return Void();
}
rocksdb::Status init(rocksdb::Options options) {
rocksdb::Status init() {
// Open instance.
TraceEvent(SevInfo, "ShardedRocksShardManagerInitBegin", this->logId).detail("DataPath", path);
std::vector<std::string> columnFamilies;
rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, path, &columnFamilies);
rocksdb::Status status = rocksdb::DB::ListColumnFamilies(dbOptions, path, &columnFamilies);
rocksdb::ColumnFamilyOptions cfOptions = getCFOptions();
std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
bool foundMetadata = false;
for (const auto& name : columnFamilies) {
if (name == METADATA_SHARD_ID) {
foundMetadata = true;
}
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions });
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, rocksdb::ColumnFamilyOptions(dbOptions) });
}
ASSERT(foundMetadata || descriptors.size() == 0);
// Add default column family if it's a newly opened database.
if (descriptors.size() == 0) {
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ "default", cfOptions });
descriptors.push_back(
rocksdb::ColumnFamilyDescriptor{ "default", rocksdb::ColumnFamilyOptions(dbOptions) });
}
std::vector<rocksdb::ColumnFamilyHandle*> handles;
status = rocksdb::DB::Open(options, path, descriptors, &handles, &db);
status = rocksdb::DB::Open(dbOptions, path, descriptors, &handles, &db);
if (!status.ok()) {
logRocksDBError(status, "Open");
return status;
@ -766,7 +769,8 @@ public:
physicalShards[defaultShard->id] = defaultShard;
// Create metadata shard.
auto metadataShard = std::make_shared<PhysicalShard>(db, METADATA_SHARD_ID);
auto metadataShard =
std::make_shared<PhysicalShard>(db, METADATA_SHARD_ID, rocksdb::ColumnFamilyOptions(dbOptions));
metadataShard->init();
columnFamilyMap[metadataShard->cf->GetID()] = metadataShard->cf;
physicalShards[METADATA_SHARD_ID] = metadataShard;
@ -832,7 +836,8 @@ public:
}
}
auto [it, inserted] = physicalShards.emplace(id, std::make_shared<PhysicalShard>(db, id));
auto [it, inserted] = physicalShards.emplace(
id, std::make_shared<PhysicalShard>(db, id, rocksdb::ColumnFamilyOptions(dbOptions)));
std::shared_ptr<PhysicalShard>& shard = it->second;
activePhysicalShardIds.emplace(id);
@ -1146,6 +1151,7 @@ public:
private:
const std::string path;
const UID logId;
rocksdb::Options dbOptions;
rocksdb::DB* db = nullptr;
std::unordered_map<std::string, std::shared_ptr<PhysicalShard>> physicalShards;
std::unordered_set<std::string> activePhysicalShardIds;
@ -1421,40 +1427,40 @@ RocksDBMetrics::RocksDBMetrics(UID debugID, std::shared_ptr<rocksdb::Statistics>
}
for (int i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; i++) {
readRangeLatencyHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds));
readValueLatencyHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds));
readPrefixLatencyHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_LATENCY_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds));
readRangeActionHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_ACTION_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_ACTION_HISTOGRAM, Histogram::Unit::milliseconds));
readValueActionHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_ACTION_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_ACTION_HISTOGRAM, Histogram::Unit::milliseconds));
readPrefixActionHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_ACTION_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_ACTION_HISTOGRAM, Histogram::Unit::milliseconds));
readRangeQueueWaitHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds));
readValueQueueWaitHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds));
readPrefixQueueWaitHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds));
readRangeNewIteratorHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM, Histogram::Unit::milliseconds));
readValueGetHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_GET_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_GET_HISTOGRAM, Histogram::Unit::milliseconds));
readPrefixGetHistograms.push_back(Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_GET_HISTOGRAM, Histogram::Unit::microseconds));
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_GET_HISTOGRAM, Histogram::Unit::milliseconds));
}
commitLatencyHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_LATENCY_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds);
commitActionHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_ACTION_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_ACTION_HISTOGRAM, Histogram::Unit::milliseconds);
commitQueueWaitHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds);
writeHistogram =
Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_WRITE_HISTOGRAM, Histogram::Unit::microseconds);
Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_WRITE_HISTOGRAM, Histogram::Unit::milliseconds);
deleteCompactRangeHistogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::milliseconds);
}
void RocksDBMetrics::logStats(rocksdb::DB* db) {
@ -1689,7 +1695,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
Future<Void> readyToStart,
std::unordered_map<std::string, std::shared_ptr<PhysicalShard>>* physicalShards) {
state Reference<Histogram> histogram = Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, "TimeSpentRefreshIterators"_sr, Histogram::Unit::microseconds);
ROCKSDBSTORAGE_HISTOGRAM_GROUP, "TimeSpentRefreshIterators"_sr, Histogram::Unit::milliseconds);
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
try {
@ -1755,7 +1761,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
struct OpenAction : TypedAction<Writer, OpenAction> {
ShardManager* shardManager;
rocksdb::Options dbOptions;
ThreadReturnPromise<Void> done;
Optional<Future<Void>>& metrics;
const FlowLock* readLock;
@ -1763,19 +1768,18 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
std::shared_ptr<RocksDBErrorListener> errorListener;
OpenAction(ShardManager* shardManager,
rocksdb::Options dbOptions,
Optional<Future<Void>>& metrics,
const FlowLock* readLock,
const FlowLock* fetchLock,
std::shared_ptr<RocksDBErrorListener> errorListener)
: shardManager(shardManager), dbOptions(dbOptions), metrics(metrics), readLock(readLock),
fetchLock(fetchLock), errorListener(errorListener) {}
: shardManager(shardManager), metrics(metrics), readLock(readLock), fetchLock(fetchLock),
errorListener(errorListener) {}
double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
};
void action(OpenAction& a) {
auto status = a.shardManager->init(a.dbOptions);
auto status = a.shardManager->init();
if (!status.ok()) {
logRocksDBError(status, "Open");
@ -1886,21 +1890,23 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
rocksdb::DB* db,
std::vector<std::pair<uint32_t, KeyRange>>* deletes,
bool sample) {
DeleteVisitor dv(deletes);
rocksdb::Status s = batch->Iterate(&dv);
if (!s.ok()) {
logRocksDBError(s, "CommitDeleteVisitor");
return s;
}
if (SERVER_KNOBS->ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE) {
DeleteVisitor dv(deletes);
rocksdb::Status s = batch->Iterate(&dv);
if (!s.ok()) {
logRocksDBError(s, "CommitDeleteVisitor");
return s;
}
// If there are any range deletes, we should have added them to be deleted.
ASSERT(!deletes->empty() || !batch->HasDeleteRange());
// If there are any range deletes, we should have added them to be deleted.
ASSERT(!deletes->empty() || !batch->HasDeleteRange());
}
rocksdb::WriteOptions options;
options.sync = !SERVER_KNOBS->ROCKSDB_UNSAFE_AUTO_FSYNC;
double writeBeginTime = sample ? timer_monotonic() : 0;
s = db->Write(options, batch);
rocksdb::Status s = db->Write(options, batch);
if (sample) {
rocksDBMetrics->getWriteHistogram()->sampleSeconds(timer_monotonic() - writeBeginTime);
}
@ -2280,7 +2286,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
numFetchWaiters(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
errorListener(std::make_shared<RocksDBErrorListener>()), errorFuture(errorListener->getFuture()),
shardManager(path, id), dbOptions(getOptions()),
dbOptions(getOptions()), shardManager(path, id, dbOptions),
rocksDBMetrics(std::make_shared<RocksDBMetrics>(id, dbOptions.statistics)) {
// In simulation, run the reader/writer threads as Coro threads (i.e. in the network thread). The storage
// engine is still multi-threaded as background compaction threads are still present. Reads/writes to disk
@ -2347,7 +2353,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
// mapping data.
} else {
auto a = std::make_unique<Writer::OpenAction>(
&shardManager, dbOptions, metrics, &readSemaphore, &fetchSemaphore, errorListener);
&shardManager, metrics, &readSemaphore, &fetchSemaphore, errorListener);
openFuture = a->done.getFuture();
this->metrics = ShardManager::shardMetricsLogger(this->rState, openFuture, &shardManager) &&
rocksDBAggregatedMetricsLogger(this->rState, openFuture, rocksDBMetrics, &shardManager);
@ -2581,8 +2587,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
std::vector<std::pair<KeyRange, std::string>> getDataMapping() { return shardManager.getDataMapping(); }
std::shared_ptr<ShardedRocksDBState> rState;
ShardManager shardManager;
rocksdb::Options dbOptions;
ShardManager shardManager;
std::shared_ptr<RocksDBMetrics> rocksDBMetrics;
std::string path;
UID id;

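One subtle point in the constructor change above: C++ initializes non-static data members in declaration order, so dbOptions has to be declared before shardManager, which now consumes it in the member initializer list. A minimal illustration of the pattern (Options, Manager and Store are illustrative types, not the FDB classes):

#include <string>

struct Options {
    std::string statistics = "default";
};

struct Manager {
    explicit Manager(const Options& o) : statistics(o.statistics) {}
    std::string statistics;
};

struct Store {
    // Declared first, therefore constructed first; shardManager can then be
    // initialized from it safely, mirroring dbOptions/shardManager above.
    Options dbOptions;
    Manager shardManager;
    Store() : dbOptions(), shardManager(dbOptions) {}
};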
View File

@ -138,7 +138,7 @@ struct LogRouterData {
: dbgid(dbgid), logSystem(new AsyncVar<Reference<ILogSystem>>()), version(req.startVersion - 1), minPopped(0),
startVersion(req.startVersion), minKnownCommittedVersion(0), poppedVersion(0), routerTag(req.routerTag),
allowPops(false), foundEpochEnd(false), generation(req.recoveryCount),
peekLatencyDist(Histogram::getHistogram("LogRouter"_sr, "PeekTLogLatency"_sr, Histogram::Unit::microseconds)),
peekLatencyDist(Histogram::getHistogram("LogRouter"_sr, "PeekTLogLatency"_sr, Histogram::Unit::milliseconds)),
cc("LogRouter", dbgid.toString()), getMoreCount("GetMoreCount", cc),
getMoreBlockedCount("GetMoreBlockedCount", cc) {
// setup just enough of a logSet to be able to call getPushLocations

View File

@ -375,7 +375,7 @@ bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) {
// parent->child.
SpanContextMessage contextMessage;
if (spanContext.isSampled()) {
CODE_PROBE(true, "Converting OTELSpanContextMessage to traced SpanContextMessage", probe::decoration::rare);
CODE_PROBE(true, "Converting OTELSpanContextMessage to traced SpanContextMessage");
contextMessage = SpanContextMessage(UID(spanContext.traceID.first(), spanContext.traceID.second()));
} else {
CODE_PROBE(true, "Converting OTELSpanContextMessage to untraced SpanContextMessage");

View File

@ -1241,7 +1241,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
// Set dataMoves[dataMoveId] = DataMoveMetaData.
ACTOR static Future<Void> startMoveShards(Database occ,
UID dataMoveId,
KeyRange keys,
std::vector<KeyRange> ranges,
std::vector<UID> servers,
MoveKeysLock lock,
FlowLock* startMoveKeysLock,
@ -1257,8 +1257,11 @@ ACTOR static Future<Void> startMoveShards(Database occ,
TraceEvent(SevDebug, "StartMoveShardsBegin", relocationIntervalId)
.detail("DataMoveID", dataMoveId)
.detail("TargetRange", keys);
.detail("TargetRange", describe(ranges));
// TODO: make startMoveShards work with multiple ranges.
ASSERT(ranges.size() == 1);
state KeyRangeRef keys = ranges[0];
try {
state Key begin = keys.begin;
state KeyRange currentKeys = keys;
@ -1576,7 +1579,7 @@ ACTOR static Future<Void> checkDataMoveComplete(Database occ, UID dataMoveId, Ke
// Clear dataMoves[dataMoveId].
ACTOR static Future<Void> finishMoveShards(Database occ,
UID dataMoveId,
KeyRange targetKeys,
std::vector<KeyRange> targetRanges,
std::vector<UID> destinationTeam,
MoveKeysLock lock,
FlowLock* finishMoveKeysParallelismLock,
@ -1585,7 +1588,10 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
std::map<UID, StorageServerInterface> tssMapping,
const DDEnabledState* ddEnabledState) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
state KeyRange keys = targetKeys;
// TODO: make startMoveShards work with multiple ranges.
ASSERT(targetRanges.size() == 1);
state KeyRange keys = targetRanges[0];
state Future<Void> warningLogger = logWarningAfter("FinishMoveShardsTooLong", 600, destinationTeam);
state int retries = 0;
state DataMoveMetaData dataMove;
@ -1636,7 +1642,7 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
} else {
TraceEvent(SevWarn, "FinishMoveShardsDataMoveDeleted", relocationIntervalId)
.detail("DataMoveID", dataMoveId);
wait(checkDataMoveComplete(occ, dataMoveId, targetKeys, relocationIntervalId));
wait(checkDataMoveComplete(occ, dataMoveId, keys, relocationIntervalId));
return Void();
}
@ -2485,9 +2491,10 @@ Future<Void> rawStartMovement(Database occ,
const MoveKeysParams& params,
std::map<UID, StorageServerInterface>& tssMapping) {
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
ASSERT(params.ranges.present());
return startMoveShards(std::move(occ),
params.dataMoveId,
params.keys,
params.ranges.get(),
params.destinationTeam,
params.lock,
params.startMoveKeysParallelismLock,
@ -2495,8 +2502,9 @@ Future<Void> rawStartMovement(Database occ,
params.ddEnabledState,
params.cancelConflictingDataMoves);
}
ASSERT(params.keys.present());
return startMoveKeys(std::move(occ),
params.keys,
params.keys.get(),
params.destinationTeam,
params.lock,
params.startMoveKeysParallelismLock,
@ -2505,13 +2513,37 @@ Future<Void> rawStartMovement(Database occ,
params.ddEnabledState);
}
Future<Void> rawCheckFetchingState(const Database& cx,
const MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
ASSERT(params.ranges.present());
// TODO: make startMoveShards work with multiple ranges.
ASSERT(params.ranges.get().size() == 1);
return checkFetchingState(cx,
params.healthyDestinations,
params.ranges.get().at(0),
params.dataMovementComplete,
params.relocationIntervalId,
tssMapping);
}
ASSERT(params.keys.present());
return checkFetchingState(cx,
params.healthyDestinations,
params.keys.get(),
params.dataMovementComplete,
params.relocationIntervalId,
tssMapping);
}
Future<Void> rawFinishMovement(Database occ,
const MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
ASSERT(params.ranges.present());
return finishMoveShards(std::move(occ),
params.dataMoveId,
params.keys,
params.ranges.get(),
params.destinationTeam,
params.lock,
params.finishMoveKeysParallelismLock,
@ -2520,8 +2552,9 @@ Future<Void> rawFinishMovement(Database occ,
tssMapping,
params.ddEnabledState);
}
ASSERT(params.keys.present());
return finishMoveKeys(std::move(occ),
params.keys,
params.keys.get(),
params.destinationTeam,
params.lock,
params.finishMoveKeysParallelismLock,
@ -2539,12 +2572,7 @@ ACTOR Future<Void> moveKeys(Database occ, MoveKeysParams params) {
wait(rawStartMovement(occ, params, tssMapping));
state Future<Void> completionSignaller = checkFetchingState(occ,
params.healthyDestinations,
params.keys,
params.dataMovementComplete,
params.relocationIntervalId,
tssMapping);
state Future<Void> completionSignaller = rawCheckFetchingState(occ, params, tssMapping);
wait(rawFinishMovement(occ, params, tssMapping));

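The same ranges-versus-keys dispatch now appears in rawStartMovement, rawCheckFetchingState and rawFinishMovement: the physical-shard path (SHARD_ENCODE_LOCATION_METADATA) carries a vector of ranges, currently restricted to exactly one, while the legacy path carries a single key range. A standalone sketch of that selection (Params and selectRange are simplified stand-ins for MoveKeysParams and the inlined checks):

#include <cassert>
#include <optional>
#include <vector>

struct KeyRange {};

struct Params {
    std::optional<std::vector<KeyRange>> ranges; // physical-shard path
    std::optional<KeyRange> keys;                // legacy path
};

KeyRange selectRange(const Params& params, bool shardEncodeLocationMetadata) {
    if (shardEncodeLocationMetadata) {
        assert(params.ranges.has_value());
        // The real code still asserts a single range until startMoveShards
        // supports multiple ranges.
        assert(params.ranges->size() == 1);
        return params.ranges->at(0);
    }
    assert(params.keys.has_value());
    return *params.keys;
}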
View File

@ -98,7 +98,6 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri
SpanContextMessage scm;
br >> scm;
} else if (OTELSpanContextMessage::startsOTELSpanContextMessage(mutationType)) {
CODE_PROBE(true, "MutationTracking reading OTELSpanContextMessage", probe::decoration::rare);
BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion()));
OTELSpanContextMessage scm;
br >> scm;

View File

@ -1633,7 +1633,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
TraceEvent("SharedTlog", tlogId).detail("Version", "4.6");
try {
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit"));
wait(restorePersistentState(&self, locality));
self.sharedActors.send(cleanupPeekTrackers(&self));

View File

@ -1484,7 +1484,7 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
self->largeDiskQueueCommitBytes.set(false);
wait(ioDegradedOrTimeoutError(
c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION));
c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION, "TLogCommit"));
if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.0001)) {
wait(delay(6.0));
}
@ -1701,7 +1701,7 @@ ACTOR Future<Void> initPersistentState(TLogData* self, Reference<LogData> logDat
}
TraceEvent("TLogInitCommit", logData->logId).log();
wait(ioTimeoutError(self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
wait(ioTimeoutError(self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogCommit"));
return Void();
}
@ -2801,13 +2801,13 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
TraceEvent("SharedTlog", tlogId).detail("Version", "6.0");
try {
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit"));
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));
} else {
wait(ioTimeoutError(checkEmptyQueue(&self) && checkRecovered(&self),
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
wait(ioTimeoutError(
checkEmptyQueue(&self) && checkRecovered(&self), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit"));
}
// Disk errors need a chance to kill this actor.

View File

@ -3291,7 +3291,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
TraceEvent("SharedTlog", tlogId).detail("Version", "6.2");
try {
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit"));
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));

View File

@ -487,12 +487,12 @@ class PaxosConfigConsumerImpl {
.detail("LargestLiveVersion", self->getCommittedVersionQuorum.getLargestLive())
.detail("SmallestCommitted", smallestCommitted);
ASSERT_GE(committedVersion, self->lastSeenVersion);
self->lastSeenVersion = committedVersion;
self->lastSeenVersion = std::max(self->lastSeenVersion, committedVersion);
self->compactionVersion = std::max(self->compactionVersion, smallestCommitted);
broadcaster->applySnapshotAndChanges(std::move(reply.snapshot),
reply.snapshotVersion,
reply.changes,
committedVersion,
self->lastSeenVersion,
reply.annotations,
self->getCommittedVersionQuorum.getReadReplicas(),
self->getCommittedVersionQuorum.getLargestLive(),
@ -534,6 +534,13 @@ class PaxosConfigConsumerImpl {
if (committedVersion > self->lastSeenVersion) {
ASSERT(self->getCommittedVersionQuorum.getReadReplicas().size() >= self->cfis.size() / 2 + 1 ||
self->getCommittedVersionQuorum.isSpecialZeroQuorum());
if (BUGGIFY) {
// Inject a random delay between getting the committed
// version and reading any changes. The goal is to
// allow attrition to occasionally kill ConfigNodes in
// this in-between state.
wait(delay(deterministicRandom()->random01() * 5));
}
state std::vector<ConfigFollowerInterface> readReplicas =
self->getCommittedVersionQuorum.getReadReplicas();
std::vector<Future<Void>> fs;
@ -567,7 +574,7 @@ class PaxosConfigConsumerImpl {
Version smallestCommitted = self->getCommittedVersionQuorum.getSmallestCommitted();
self->compactionVersion = std::max(self->compactionVersion, smallestCommitted);
broadcaster->applyChanges(reply.changes,
committedVersion,
self->lastSeenVersion,
reply.annotations,
self->getCommittedVersionQuorum.getReadReplicas());
} else if (committedVersion == self->lastSeenVersion) {

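A brief note on the version handling above: assigning with std::max keeps lastSeenVersion from regressing if a quorum reports an older committed version, and the broadcaster is now driven from that monotonic value rather than the raw committedVersion. Sketch of the invariant (standalone, not the PaxosConfigConsumerImpl interface):

#include <algorithm>
#include <cstdint>

using Version = int64_t;

// Advance the consumer's high-water mark without ever moving backwards.
void advanceLastSeen(Version& lastSeenVersion, Version committedVersion) {
    lastSeenVersion = std::max(lastSeenVersion, committedVersion);
}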
View File

@ -43,9 +43,9 @@
#include "flow/actorcompiler.h" // has to be last include
#ifdef SSD_ROCKSDB_EXPERIMENTAL
// Enforcing rocksdb version to be 6.22.1 or greater.
static_assert(ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR >= 22 && ROCKSDB_PATCH >= 1,
"Unsupported rocksdb version. Update the rocksdb to at least 6.22.1 version");
// Enforcing rocksdb version to be 7.7.3.
static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3),
"Unsupported rocksdb version. Update the rocksdb to 7.7.3 version");
namespace {

View File

@ -328,6 +328,13 @@ class TestConfig : public BasicTestConfig {
if (attrib == "disableEncryption") {
disableEncryption = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "encryptModes") {
std::stringstream ss(value);
std::string token;
while (std::getline(ss, token, ',')) {
encryptModes.push_back(token);
}
}
if (attrib == "restartInfoLocation") {
isFirstTestInRestart = true;
}
@ -397,6 +404,9 @@ public:
bool disableRemoteKVS = false;
// 7.2 cannot be downgraded to 7.1 or below after enabling encryption-at-rest.
bool disableEncryption = false;
// By default, encryption mode is set randomly (based on the tenant mode)
// If provided, set using EncryptionAtRestMode::fromString
std::vector<std::string> encryptModes;
// Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig
// 0 = "ssd"
// 1 = "memory"
@ -474,6 +484,7 @@ public:
.add("disableHostname", &disableHostname)
.add("disableRemoteKVS", &disableRemoteKVS)
.add("disableEncryption", &disableEncryption)
.add("encryptModes", &encryptModes)
.add("simpleConfig", &simpleConfig)
.add("generateFearless", &generateFearless)
.add("datacenters", &datacenters)
@ -1274,6 +1285,7 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisableRemoteKVS");
}
// TODO: Remove this code when encryption knobs are removed
if (testConfig->disableEncryption) {
g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false }));
g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false }));
@ -2052,6 +2064,19 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
simconfig.db.tenantMode = tenantMode;
simconfig.db.encryptionAtRestMode = EncryptionAtRestMode::DISABLED;
if (!testConfig.encryptModes.empty()) {
simconfig.db.encryptionAtRestMode =
EncryptionAtRestMode::fromString(deterministicRandom()->randomChoice(testConfig.encryptModes));
} else if (!testConfig.disableEncryption && deterministicRandom()->coinflip()) {
if (tenantMode == TenantMode::DISABLED || tenantMode == TenantMode::OPTIONAL_TENANT ||
deterministicRandom()->coinflip()) {
// optional and disabled tenant modes currently only support cluster aware encryption
simconfig.db.encryptionAtRestMode = EncryptionAtRestMode::CLUSTER_AWARE;
} else {
simconfig.db.encryptionAtRestMode = EncryptionAtRestMode::DOMAIN_AWARE;
}
}
TraceEvent("SimulatedClusterEncryptionMode").detail("Mode", simconfig.db.encryptionAtRestMode.toString());
g_simulator->blobGranulesEnabled = simconfig.db.blobGranulesEnabled;
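The hunk above picks the simulated cluster's encryption-at-rest mode: an explicit encryptModes list from the test config wins, otherwise the mode is chosen randomly subject to the tenant mode. A hedged standalone sketch of that selection follows (the mode strings and rand() are illustrative stand-ins; FDB uses EncryptionAtRestMode::fromString and deterministicRandom()):

#include <cstdlib>
#include <string>
#include <vector>

// Illustrative mode names; the real values come from EncryptionAtRestMode::fromString.
std::string pickEncryptionMode(const std::vector<std::string>& encryptModes,
                               bool disableEncryption,
                               bool tenantsRequired) {
    if (!encryptModes.empty()) {
        // An explicit encryptModes list in the test config wins; pick one of its entries at random.
        return encryptModes[std::rand() % encryptModes.size()];
    }
    if (disableEncryption || std::rand() % 2 == 0) {
        return "disabled";
    }
    if (!tenantsRequired || std::rand() % 2 == 0) {
        return "cluster_aware"; // disabled/optional tenant modes only support cluster-aware encryption
    }
    return "domain_aware";
}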
@ -2065,6 +2090,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisableRemoteKVS");
}
// TODO: Remove this code once encryption knobs are removed
if (testConfig.disableEncryption) {
g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false }));
g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false }));

View File

@ -19,6 +19,8 @@
*/
#include <cinttypes>
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fmt/format.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BlobWorkerInterface.h"
@ -2443,6 +2445,47 @@ ACTOR static Future<JsonBuilderObject> blobWorkerStatusFetcher(
return statusObj;
}
ACTOR static Future<JsonBuilderObject> blobRestoreStatusFetcher(Database db, std::set<std::string>* incompleteReason) {
state JsonBuilderObject statusObj;
state std::vector<Future<Optional<TraceEventFields>>> futures;
try {
Optional<BlobRestoreStatus> status = wait(getRestoreStatus(db, normalKeys));
if (status.present()) {
switch (status.get().phase) {
case BlobRestorePhase::INIT:
statusObj["blob_full_restore_phase"] = "Initializing";
break;
case BlobRestorePhase::LOAD_MANIFEST:
statusObj["blob_full_restore_phase"] = "Loading manifest";
break;
case BlobRestorePhase::MANIFEST_DONE:
statusObj["blob_full_restore_phase"] = "Manifest loaded";
break;
case BlobRestorePhase::MIGRATE:
statusObj["blob_full_restore_phase"] = "Copying data";
statusObj["blob_full_restore_progress"] = status.get().progress;
break;
case BlobRestorePhase::APPLY_MLOGS:
statusObj["blob_full_restore_phase"] = "Applying mutation logs";
statusObj["blob_full_restore_progress"] = status.get().progress;
break;
case BlobRestorePhase::DONE:
statusObj["blob_full_restore_phase"] = "Completed";
break;
default:
statusObj["blob_full_restore_phase"] = "Unexpected phase";
}
}
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;
incompleteReason->insert("Unable to query blob restore status");
}
return statusObj;
}
static JsonBuilderObject tlogFetcher(int* logFaultTolerance,
const std::vector<TLogSet>& tLogs,
std::unordered_map<NetworkAddress, WorkerInterface> const& address_workers) {
@ -3409,6 +3452,8 @@ ACTOR Future<StatusReply> clusterGetStatus(
JsonBuilderObject blobGranuelsStatus =
wait(blobWorkerStatusFetcher(blobWorkers, address_workers, &status_incomplete_reasons));
statusObj["blob_granules"] = blobGranuelsStatus;
JsonBuilderObject blobRestoreStatus = wait(blobRestoreStatusFetcher(cx, &status_incomplete_reasons));
statusObj["blob_restore"] = blobRestoreStatus;
}
JsonBuilderArray incompatibleConnectionsArray;

View File

@ -375,7 +375,7 @@ struct TLogData : NonCopyable {
peekMemoryLimiter(SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES),
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopDeadline(0), dataFolder(folder),
degraded(degraded),
commitLatencyDist(Histogram::getHistogram("tLog"_sr, "commit"_sr, Histogram::Unit::microseconds)) {
commitLatencyDist(Histogram::getHistogram("tLog"_sr, "commit"_sr, Histogram::Unit::milliseconds)) {
cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
}
};
@ -1098,7 +1098,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
}
// SOMEDAY: This seems to be running pretty often, should we slow it down???
// This needs a timeout since nothing prevents I/O operations from hanging indefinitely.
wait(ioTimeoutError(self->persistentData->commit(), tLogMaxCreateDuration));
wait(ioTimeoutError(self->persistentData->commit(), tLogMaxCreateDuration, "TLogCommit"));
wait(delay(0, TaskPriority::UpdateStorage));
@ -2160,7 +2160,7 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
self->largeDiskQueueCommitBytes.set(false);
wait(ioDegradedOrTimeoutError(
c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION));
c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION, "TLogCommit"));
if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.0001)) {
wait(delay(6.0));
}
@ -3464,7 +3464,8 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
logData->unpoppedRecoveredTagCount = req.allTags.size();
logData->unpoppedRecoveredTags = std::set<Tag>(req.allTags.begin(), req.allTags.end());
wait(ioTimeoutError(initPersistentState(self, logData) || logData->removed,
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION,
"TLogInit"));
TraceEvent("TLogRecover", self->dbgid)
.detail("LogId", logData->logId)
@ -3529,7 +3530,8 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
} else {
// Brand new tlog, initialization has already been done by caller
wait(ioTimeoutError(initPersistentState(self, logData) || logData->removed,
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION,
"TLogInit"));
if (logData->recoveryComplete.isSet()) {
throw worker_removed();
@ -3600,13 +3602,14 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
TraceEvent("SharedTlog", tlogId);
try {
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit"));
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));
} else {
wait(ioTimeoutError(checkEmptyQueue(&self) && initPersistentStorage(&self),
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION,
"TLogInit"));
}
// Disk errors need a chance to kill this actor.

View File

@ -546,7 +546,7 @@ Future<Version> TagPartitionedLogSystem::push(Version prevVersion,
it->tlogPushDistTrackers.push_back(
Histogram::getHistogram("ToTlog_" + it->logServers[i]->get().interf().uniqueID.toString(),
it->logServers[i]->get().interf().address().toString(),
Histogram::Unit::microseconds));
Histogram::Unit::milliseconds));
}
}
std::vector<Future<Void>> tLogCommitResults;

View File

@ -124,9 +124,17 @@ public:
state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL;
state double lastTenantListFetchTime = now();
state double lastTraceTime = 0;
loop {
state double fetchStartTime = now();
state bool toTrace = false;
if (fetchStartTime - lastTraceTime > SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_TRACE_INTERVAL) {
toTrace = true;
lastTraceTime = fetchStartTime;
}
state std::vector<TenantGroupName> groups;
for (const auto& [group, storage] : tenantCache->tenantStorageMap) {
groups.push_back(group);
@ -159,6 +167,14 @@ public:
}
}
tenantCache->tenantStorageMap[group].usage = usage;
if (toTrace) {
// Trace the storage used by all tenant groups for visibility.
TraceEvent(SevInfo, "StorageUsageUpdated", tenantCache->id())
.detail("TenantGroup", group)
.detail("Quota", tenantCache->tenantStorageMap[group].quota)
.detail("Usage", tenantCache->tenantStorageMap[group].usage);
}
}
lastTenantListFetchTime = now();
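The toTrace/lastTraceTime bookkeeping added above rate-limits the new StorageUsageUpdated event to at most one emission per TENANT_CACHE_STORAGE_USAGE_TRACE_INTERVAL, even though usage is refreshed more frequently. A minimal standalone sketch of the pattern (illustrative names and intervals, plain C++ rather than Flow):

#include <chrono>
#include <iostream>
#include <thread>

int main() {
    using clock = std::chrono::steady_clock;
    const auto refreshInterval = std::chrono::milliseconds(200); // stand-in for the refresh knob
    const auto traceInterval = std::chrono::seconds(1);          // stand-in for the trace knob
    auto lastTraceTime = clock::time_point();                    // epoch, so the first pass traces
    for (int i = 0; i < 10; ++i) {
        const auto fetchStartTime = clock::now();
        const bool toTrace = (fetchStartTime - lastTraceTime) > traceInterval;
        if (toTrace) {
            lastTraceTime = fetchStartTime;
            std::cout << "StorageUsageUpdated pass=" << i << "\n"; // stands in for TraceEvent(...)
        }
        // ... refresh per-tenant-group quota/usage here ...
        std::this_thread::sleep_for(refreshInterval);
    }
    return 0;
}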

View File

@ -459,7 +459,13 @@ public:
// Since cursors can have async operations pending which modify their state, they can't be copied cleanly
Cursor(const Cursor& other) = delete;
~Cursor() { writeOperations.cancel(); }
~Cursor() { cancel(); }
// Cancel outstanding operations. Further use of cursor is not allowed.
void cancel() {
nextPageReader.cancel();
writeOperations.cancel();
}
// A read cursor can be initialized from a pop cursor
void initReadOnly(const Cursor& c, bool readExtents = false) {
@ -921,7 +927,15 @@ public:
public:
FIFOQueue() : pager(nullptr) {}
~FIFOQueue() { newTailPage.cancel(); }
~FIFOQueue() { cancel(); }
// Cancel outstanding operations. Further use of queue is not allowed.
void cancel() {
headReader.cancel();
tailWriter.cancel();
headWriter.cancel();
newTailPage.cancel();
}
FIFOQueue(const FIFOQueue& other) = delete;
void operator=(const FIFOQueue& rhs) = delete;
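The ~Cursor and ~FIFOQueue changes above move cleanup into an explicit cancel() member, so every outstanding future is cancelled in one place whether the object is destroyed or shut down explicitly. A toy standalone version of the pattern (std::function stands in for Flow futures; illustrative only):

#include <functional>
#include <vector>

struct CursorSketch {
    std::vector<std::function<void()>> pendingCancellers; // one canceller per outstanding operation
    // Cancel outstanding operations. Further use of the cursor is not allowed.
    void cancel() {
        for (auto& c : pendingCancellers)
            c();
        pendingCancellers.clear();
    }
    ~CursorSketch() { cancel(); } // destructor and explicit shutdown share one code path
};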
@ -3627,6 +3641,13 @@ public:
}
self->operations.clear();
debug_printf("DWALPager(%s) shutdown cancel queues\n", self->filename.c_str());
self->freeList.cancel();
self->delayedFreeList.cancel();
self->remapQueue.cancel();
self->extentFreeList.cancel();
self->extentUsedList.cancel();
debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str());
wait(self->extentCache.clear());
wait(self->pageCache.clear());
@ -4697,21 +4718,15 @@ public:
if (domainId.present()) {
ASSERT(keyProvider && keyProvider->enableEncryptionDomain());
// Temporarily disabling the check, since if a tenant is removed, where the key provider
// would not find the domain, the data for the tenant may still be in Redwood and being read.
// TODO(yiwu): re-enable the check.
/*
if (domainId.get() != keyProvider->getDefaultEncryptionDomainId() &&
!keyProvider->keyFitsInDomain(domainId.get(), lowerBound, false)) {
fprintf(stderr,
"Page lower bound not in domain: %s %s, domain id %s, lower bound '%s'\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
lowerBound.printable().c_str());
return false;
if (!keyProvider->keyFitsInDomain(domainId.get(), lowerBound, true)) {
fprintf(stderr,
"Page lower bound not in domain: %s %s, domain id %s, lower bound '%s'\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
lowerBound.printable().c_str());
return false;
}
*/
}
auto& b = boundariesByPageID[id.front()][v];
@ -4759,45 +4774,27 @@ public:
::toString(b->second.domainId).c_str());
return false;
}
// Temporarily disabling the check, since if a tenant is removed, where the key provider
// would not find the domain, the data for the tenant may still be in Redwood and being read.
// TODO(yiwu): re-enable the check.
/*
ASSERT(domainId.present());
auto checkKeyFitsInDomain = [&]() -> bool {
if (!keyProvider->keyFitsInDomain(domainId.get(), cursor.get().key, b->second.height > 1)) {
fprintf(stderr,
"Encryption domain mismatch on %s, %s, domain: %s, key %s\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
cursor.get().key.printable().c_str());
return false;
}
return true;
if (!keyProvider->keyFitsInDomain(domainId.get(), cursor.get().key, b->second.height > 1)) {
fprintf(stderr,
"Encryption domain mismatch on %s, %s, domain: %s, key %s\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
cursor.get().key.printable().c_str());
return false;
}
return true;
};
if (domainId.get() != keyProvider->getDefaultEncryptionDomainId()) {
cursor.moveFirst();
if (cursor.valid() && !checkKeyFitsInDomain()) {
return false;
}
cursor.moveLast();
if (cursor.valid() && !checkKeyFitsInDomain()) {
return false;
}
} else {
if (deterministicRandom()->random01() < domainPrefixScanProbability) {
cursor.moveFirst();
while (cursor.valid()) {
if (!checkKeyFitsInDomain()) {
return false;
}
cursor.moveNext();
}
domainPrefixScanCount++;
}
cursor.moveFirst();
if (cursor.valid() && !checkKeyFitsInDomain()) {
return false;
}
cursor.moveLast();
if (cursor.valid() && !checkKeyFitsInDomain()) {
return false;
}
*/
}
return true;
@ -5674,8 +5671,8 @@ private:
int64_t defaultDomainId = keyProvider->getDefaultEncryptionDomainId();
int64_t currentDomainId;
size_t prefixLength;
if (count == 0 || (splitByDomain && count > 0)) {
std::tie(currentDomainId, prefixLength) = keyProvider->getEncryptionDomain(rec.key, domainId);
if (count == 0 || splitByDomain) {
std::tie(currentDomainId, prefixLength) = keyProvider->getEncryptionDomain(rec.key);
}
if (count == 0) {
domainId = currentDomainId;
@ -5886,12 +5883,18 @@ private:
if (useEncryptionDomain) {
ASSERT(pagesToBuild[0].domainId.present());
int64_t domainId = pagesToBuild[0].domainId.get();
// We need to make sure we use the domain prefix as the page lower bound, for the first page
// of a non-default domain on a level. That way we ensure that pages for a domain form a full subtree
// (i.e. have a single root) in the B-tree.
if (domainId != self->m_keyProvider->getDefaultEncryptionDomainId() &&
!self->m_keyProvider->keyFitsInDomain(domainId, pageLowerBound.key, false)) {
pageLowerBound = RedwoodRecordRef(entries[0].key.substr(0, pagesToBuild[0].domainPrefixLength));
// We make sure the page lower bound fits in the domain of the page.
// If the page domain is the default domain, we make sure the page doesn't fall within a domain
// specific subtree.
// If the page domain is non-default, in addition, we make the first page of the domain on a level
// use the domain prefix as the lower bound. Such a lower bound will ensure that pages for a domain
// form a full subtree (i.e. have a single root) in the B-tree.
if (!self->m_keyProvider->keyFitsInDomain(domainId, pageLowerBound.key, true)) {
if (domainId == self->m_keyProvider->getDefaultEncryptionDomainId()) {
pageLowerBound = RedwoodRecordRef(entries[0].key);
} else {
pageLowerBound = RedwoodRecordRef(entries[0].key.substr(0, pagesToBuild[0].domainPrefixLength));
}
}
}

View File

@ -163,7 +163,8 @@ ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProv
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<int64_t> lastBlobEpoc(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef range);
ACTOR Future<Void> updateRestoreStatus(Database db, KeyRangeRef range, BlobRestoreStatus status);
ACTOR Future<Optional<BlobRestoreStatus>> getRestoreStatus(Database db, KeyRangeRef range);
#include "flow/unactorcompiler.h"
#endif

View File

@ -920,7 +920,7 @@ public:
}
if (fitness == ProcessClass::NeverAssign) {
logWorkerUnavailable(
SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds);
SevDebug, id, "simple", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds);
continue;
}
if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) {
@ -1072,7 +1072,7 @@ public:
}
if (fitness == ProcessClass::NeverAssign) {
logWorkerUnavailable(
SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds);
SevDebug, id, "deprecated", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds);
continue;
}
if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) {

View File

@ -110,8 +110,7 @@ struct ConfigFollowerGetChangesReply {
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> annotations;
ConfigFollowerGetChangesReply() = default;
explicit ConfigFollowerGetChangesReply(Version mostRecentVersion,
Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
explicit ConfigFollowerGetChangesReply(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> const& annotations)
: changes(changes), annotations(annotations) {}

View File

@ -284,12 +284,12 @@ public:
const std::unordered_set<uint64_t>& excludedPhysicalShards,
uint64_t debugID);
// Step 2: get a remote team which has the input physical shard
// Return empty if no such remote team
// May return a problematic remote team, and re-selection is required for this case
Optional<ShardsAffectedByTeamFailure::Team> tryGetAvailableRemoteTeamWith(uint64_t inputPhysicalShardID,
StorageMetrics const& moveInMetrics,
uint64_t debugID);
// Step 2: get a remote team which has the input physical shard.
// Second field in the returned pair indicates whether this physical shard is available or not.
// Return empty if no such remote team.
// May return a problematic remote team, and re-selection is required for this case.
std::pair<Optional<ShardsAffectedByTeamFailure::Team>, bool>
tryGetAvailableRemoteTeamWith(uint64_t inputPhysicalShardID, StorageMetrics const& moveInMetrics, uint64_t debugID);
// Invariant:
// (1) If forceToUseNewPhysicalShard is set, use the bestTeams selected by getTeam(), and create a new physical
// shard for the teams

View File

@ -90,21 +90,11 @@ public:
virtual int64_t getDefaultEncryptionDomainId() const { throw not_implemented(); }
// Get encryption domain from a key. Return the domain id, and the size of the encryption domain prefix.
// It is assumed that all keys with the same encryption domain prefix as the given key fall in the same encryption
// domain. If possibleDomainId is given, it is a valid domain id previously returned by the key provider,
// potentially for a different key. The possibleDomainId param is used by TenantAwareEncryptionKeyProvider to speed
// up encryption domain lookup.
virtual std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key,
Optional<int64_t> possibleDomainId = Optional<int64_t>()) {
throw not_implemented();
}
virtual std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key) { throw not_implemented(); }
// Get encryption domain of a page given encoding header.
virtual int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) { throw not_implemented(); }
// Setting tenant prefix to tenant name map. Used by TenantAwareEncryptionKeyProvider.
virtual void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) {}
// Helper methods.
// Check if a key fits in an encryption domain.
@ -220,7 +210,7 @@ public:
int64_t getDefaultEncryptionDomainId() const override { return FDB_DEFAULT_ENCRYPT_DOMAIN_ID; }
std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key, Optional<int64_t>) override {
std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key) override {
int64_t domainId;
if (key.size() < PREFIX_LENGTH) {
domainId = getDefaultEncryptionDomainId();
@ -291,6 +281,8 @@ class TenantAwareEncryptionKeyProvider : public IPageEncryptionKeyProvider {
public:
using EncodingHeader = ArenaPage::AESEncryptionV1Encoder::Header;
const StringRef systemKeysPrefix = systemKeys.begin;
TenantAwareEncryptionKeyProvider(Reference<AsyncVar<ServerDBInfo> const> db) : db(db) {}
virtual ~TenantAwareEncryptionKeyProvider() = default;
@ -337,10 +329,10 @@ public:
int64_t getDefaultEncryptionDomainId() const override { return FDB_DEFAULT_ENCRYPT_DOMAIN_ID; }
std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key, Optional<int64_t> possibleDomainId) override {
std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key) override {
// System key.
if (key.startsWith(systemKeys.begin)) {
return { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, 2 };
if (key.startsWith(systemKeysPrefix)) {
return { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, systemKeysPrefix.size() };
}
// Key smaller than tenant prefix in size belongs to the default domain.
if (key.size() < TENANT_PREFIX_SIZE) {
@ -352,21 +344,7 @@ public:
if (tenantId < 0) {
return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 };
}
// Optimization: Caller guarantee possibleDomainId is a valid domain id that we previously returned.
// We can return immediately without checking with tenant map.
if (possibleDomainId.present() && possibleDomainId.get() == tenantId) {
return { tenantId, TENANT_PREFIX_SIZE };
}
if (tenantPrefixIndex.isValid()) {
auto view = tenantPrefixIndex->atLatest();
auto itr = view.find(prefix);
if (itr != view.end()) {
// Tenant not found. Tenant must be disabled, or in optional mode.
return { tenantId, TENANT_PREFIX_SIZE };
}
}
// The prefix does not belong to any tenant. The key belongs to the default domain.
return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 };
return { tenantId, TENANT_PREFIX_SIZE };
}
int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) override {
@ -375,13 +353,8 @@ public:
return header->cipherTextDetails.encryptDomainId;
}
void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) override {
this->tenantPrefixIndex = tenantPrefixIndex;
}
private:
Reference<AsyncVar<ServerDBInfo> const> db;
Reference<TenantPrefixIndex> tenantPrefixIndex;
};
#include "flow/unactorcompiler.h"

View File

@ -58,7 +58,12 @@ public:
struct MoveKeysParams {
UID dataMoveId;
KeyRange keys;
// Only one of `keys` and `ranges` can be set. `ranges` is created mainly for physical shard moves to move a full
// physical shard with multiple key ranges.
Optional<KeyRange> keys;
Optional<std::vector<KeyRange>> ranges;
std::vector<UID> destinationTeam, healthyDestinations;
MoveKeysLock lock;
Promise<Void> dataMovementComplete;
@ -68,6 +73,46 @@ struct MoveKeysParams {
UID relocationIntervalId;
const DDEnabledState* ddEnabledState = nullptr;
CancelConflictingDataMoves cancelConflictingDataMoves = CancelConflictingDataMoves::False;
MoveKeysParams() {}
MoveKeysParams(UID dataMoveId,
const KeyRange& keys,
const std::vector<UID>& destinationTeam,
const std::vector<UID>& healthyDestinations,
const MoveKeysLock& lock,
const Promise<Void>& dataMovementComplete,
FlowLock* startMoveKeysParallelismLock,
FlowLock* finishMoveKeysParallelismLock,
bool hasRemote,
UID relocationIntervalId,
const DDEnabledState* ddEnabledState,
CancelConflictingDataMoves cancelConflictingDataMoves)
: dataMoveId(dataMoveId), keys(keys), destinationTeam(destinationTeam), healthyDestinations(healthyDestinations),
lock(lock), dataMovementComplete(dataMovementComplete),
startMoveKeysParallelismLock(startMoveKeysParallelismLock),
finishMoveKeysParallelismLock(finishMoveKeysParallelismLock), hasRemote(hasRemote),
relocationIntervalId(relocationIntervalId), ddEnabledState(ddEnabledState),
cancelConflictingDataMoves(cancelConflictingDataMoves) {}
MoveKeysParams(UID dataMoveId,
const std::vector<KeyRange>& ranges,
const std::vector<UID>& destinationTeam,
const std::vector<UID>& healthyDestinations,
const MoveKeysLock& lock,
const Promise<Void>& dataMovementComplete,
FlowLock* startMoveKeysParallelismLock,
FlowLock* finishMoveKeysParallelismLock,
bool hasRemote,
UID relocationIntervalId,
const DDEnabledState* ddEnabledState,
CancelConflictingDataMoves cancelConflictingDataMoves)
: dataMoveId(dataMoveId), ranges(ranges), destinationTeam(destinationTeam),
healthyDestinations(healthyDestinations), lock(lock), dataMovementComplete(dataMovementComplete),
startMoveKeysParallelismLock(startMoveKeysParallelismLock),
finishMoveKeysParallelismLock(finishMoveKeysParallelismLock), hasRemote(hasRemote),
relocationIntervalId(relocationIntervalId), ddEnabledState(ddEnabledState),
cancelConflictingDataMoves(cancelConflictingDataMoves) {}
};
// read the lock value in system keyspace but do not change anything
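MoveKeysParams now carries either a single keys range or a ranges vector, and the comment states that exactly one of the two is set. A small standalone sketch of that contract using std::optional (simplified types, not FDB's):

#include <cassert>
#include <optional>
#include <string>
#include <vector>

struct KeyRangeSketch {
    std::string begin, end;
};

struct MoveKeysParamsSketch {
    std::optional<KeyRangeSketch> keys;                // legacy single-range form
    std::optional<std::vector<KeyRangeSketch>> ranges; // physical-shard, multi-range form
    bool wellFormed() const { return keys.has_value() != ranges.has_value(); }
};

int main() {
    MoveKeysParamsSketch single;
    single.keys = KeyRangeSketch{ "a", "b" };
    MoveKeysParamsSketch multi;
    multi.ranges = std::vector<KeyRangeSketch>{ { "a", "b" }, { "c", "d" } };
    assert(single.wellFormed() && multi.wellFormed());
    return 0;
}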

View File

@ -137,16 +137,16 @@ struct ProxyStats {
SERVER_KNOBS->LATENCY_SKETCH_ACCURACY),
maxComputeNS(0), minComputeNS(1e12),
commitBatchQueuingDist(
Histogram::getHistogram("CommitProxy"_sr, "CommitBatchQueuing"_sr, Histogram::Unit::microseconds)),
Histogram::getHistogram("CommitProxy"_sr, "CommitBatchQueuing"_sr, Histogram::Unit::milliseconds)),
getCommitVersionDist(
Histogram::getHistogram("CommitProxy"_sr, "GetCommitVersion"_sr, Histogram::Unit::microseconds)),
resolutionDist(Histogram::getHistogram("CommitProxy"_sr, "Resolution"_sr, Histogram::Unit::microseconds)),
Histogram::getHistogram("CommitProxy"_sr, "GetCommitVersion"_sr, Histogram::Unit::milliseconds)),
resolutionDist(Histogram::getHistogram("CommitProxy"_sr, "Resolution"_sr, Histogram::Unit::milliseconds)),
postResolutionDist(
Histogram::getHistogram("CommitProxy"_sr, "PostResolutionQueuing"_sr, Histogram::Unit::microseconds)),
Histogram::getHistogram("CommitProxy"_sr, "PostResolutionQueuing"_sr, Histogram::Unit::milliseconds)),
processingMutationDist(
Histogram::getHistogram("CommitProxy"_sr, "ProcessingMutation"_sr, Histogram::Unit::microseconds)),
tlogLoggingDist(Histogram::getHistogram("CommitProxy"_sr, "TlogLogging"_sr, Histogram::Unit::microseconds)),
replyCommitDist(Histogram::getHistogram("CommitProxy"_sr, "ReplyCommit"_sr, Histogram::Unit::microseconds)) {
Histogram::getHistogram("CommitProxy"_sr, "ProcessingMutation"_sr, Histogram::Unit::milliseconds)),
tlogLoggingDist(Histogram::getHistogram("CommitProxy"_sr, "TlogLogging"_sr, Histogram::Unit::milliseconds)),
replyCommitDist(Histogram::getHistogram("CommitProxy"_sr, "ReplyCommit"_sr, Histogram::Unit::milliseconds)) {
specialCounter(cc, "LastAssignedCommitVersion", [this]() { return this->lastCommitVersionAssigned; });
specialCounter(cc, "Version", [pVersion]() { return pVersion->get(); });
specialCounter(cc, "CommittedVersion", [pCommittedVersion]() { return pCommittedVersion->get(); });

View File

@ -1284,7 +1284,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
typedef decltype(&tLog) TLogFn;
ACTOR template <class T>
Future<T> ioTimeoutError(Future<T> what, double time) {
Future<T> ioTimeoutError(Future<T> what, double time, const char* context = nullptr) {
// Before simulation is sped up, IO operations can take a very long time so limit timeouts
// to not end until at least time after simulation is sped up.
if (g_network->isSimulated() && !g_simulator->speedUpSimulation) {
@ -1298,7 +1298,12 @@ Future<T> ioTimeoutError(Future<T> what, double time) {
if (g_network->isSimulated() && !g_simulator->getCurrentProcess()->isReliable()) {
err = err.asInjectedFault();
}
TraceEvent(SevError, "IoTimeoutError").error(err);
TraceEvent e(SevError, "IoTimeoutError");
e.error(err);
if (context != nullptr) {
e.detail("Context", context);
}
e.log();
throw err;
}
}
@ -1308,7 +1313,8 @@ ACTOR template <class T>
Future<T> ioDegradedOrTimeoutError(Future<T> what,
double errTime,
Reference<AsyncVar<bool>> degraded,
double degradedTime) {
double degradedTime,
const char* context = nullptr) {
// Before simulation is sped up, IO operations can take a very long time so limit timeouts
// to not end until at least time after simulation is sped up.
if (g_network->isSimulated() && !g_simulator->speedUpSimulation) {
@ -1337,7 +1343,12 @@ Future<T> ioDegradedOrTimeoutError(Future<T> what,
if (g_network->isSimulated() && !g_simulator->getCurrentProcess()->isReliable()) {
err = err.asInjectedFault();
}
TraceEvent(SevError, "IoTimeoutError").error(err);
TraceEvent e(SevError, "IoTimeoutError");
e.error(err);
if (context != nullptr) {
e.detail("Context", context);
}
e.log();
throw err;
}
}
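The new optional context argument above only changes what gets logged when the timeout fires. As a rough standalone illustration of the same pattern (plain C++ with std::future standing in for Flow futures; the names are ours, not FDB's):

#include <chrono>
#include <future>
#include <iostream>
#include <stdexcept>
#include <string>

template <class T>
T getWithTimeout(std::future<T>& f, std::chrono::milliseconds limit, const char* context = nullptr) {
    if (f.wait_for(limit) == std::future_status::timeout) {
        // Mirrors TraceEvent(SevError, "IoTimeoutError") plus the optional Context detail.
        std::string msg = "IoTimeoutError";
        if (context != nullptr)
            msg += std::string(" Context=") + context;
        std::cerr << msg << "\n";
        throw std::runtime_error(msg);
    }
    return f.get();
}

Callers label the operation, e.g. getWithTimeout(commitFuture, std::chrono::seconds(10), "TLogCommit"), so a timeout in the log can be attributed to a specific I/O path rather than a generic error.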

View File

@ -159,8 +159,7 @@ bool canReplyWith(Error e) {
#define PERSIST_PREFIX "\xff\xff"
FDB_DECLARE_BOOLEAN_PARAM(UnlimitedCommitBytes);
FDB_DEFINE_BOOLEAN_PARAM(UnlimitedCommitBytes);
FDB_BOOLEAN_PARAM(UnlimitedCommitBytes);
// Immutable
static const KeyValueRef persistFormat(PERSIST_PREFIX "Format"_sr, "FoundationDB/StorageServer/1/4"_sr);
@ -786,7 +785,7 @@ public:
std::map<Version, std::vector<CheckpointMetaData>> pendingCheckpoints; // Pending checkpoint requests
std::unordered_map<UID, CheckpointMetaData> checkpoints; // Existing and deleting checkpoints
TenantMap tenantMap;
Reference<TenantPrefixIndex> tenantPrefixIndex;
TenantPrefixIndex tenantPrefixIndex;
std::map<Version, std::vector<PendingNewShard>>
pendingAddRanges; // Pending requests to add ranges to physical shards
std::map<Version, std::vector<KeyRange>>
@ -805,7 +804,7 @@ public:
FetchKeysHistograms()
: latency(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
FETCH_KEYS_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
Histogram::Unit::milliseconds)),
bytes(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
FETCH_KEYS_BYTES_HISTOGRAM,
Histogram::Unit::bytes)),
@ -1369,31 +1368,31 @@ public:
Reference<AsyncVar<ServerDBInfo> const> const& db,
StorageServerInterface const& ssi,
Reference<IPageEncryptionKeyProvider> encryptionKeyProvider)
: tenantPrefixIndex(makeReference<TenantPrefixIndex>()), encryptionKeyProvider(encryptionKeyProvider),
shardAware(false), tlogCursorReadsLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
TLOG_CURSOR_READS_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
: encryptionKeyProvider(encryptionKeyProvider), shardAware(false),
tlogCursorReadsLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
TLOG_CURSOR_READS_LATENCY_HISTOGRAM,
Histogram::Unit::milliseconds)),
ssVersionLockLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
SS_VERSION_LOCK_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
Histogram::Unit::milliseconds)),
eagerReadsLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
EAGER_READS_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
Histogram::Unit::milliseconds)),
fetchKeysPTreeUpdatesLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
FETCH_KEYS_PTREE_UPDATES_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
Histogram::Unit::milliseconds)),
tLogMsgsPTreeUpdatesLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
TLOG_MSGS_PTREE_UPDATES_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
Histogram::Unit::milliseconds)),
storageUpdatesDurableLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
STORAGE_UPDATES_DURABLE_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
Histogram::Unit::milliseconds)),
storageCommitLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
STORAGE_COMMIT_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
Histogram::Unit::milliseconds)),
ssDurableVersionUpdateLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
Histogram::Unit::milliseconds)),
readRangeBytesReturnedHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
SS_READ_RANGE_BYTES_RETURNED_HISTOGRAM,
Histogram::Unit::bytes)),
@ -5111,7 +5110,7 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
throw tenant_name_required();
}
if (rangeIntersectsAnyTenant(*(data->tenantPrefixIndex), KeyRangeRef(begin, end), req.version)) {
if (rangeIntersectsAnyTenant(data->tenantPrefixIndex, KeyRangeRef(begin, end), req.version)) {
throw tenant_name_required();
}
}
@ -8616,11 +8615,11 @@ private:
bool StorageServer::insertTenant(TenantNameRef tenantName, TenantMapEntry tenantEntry, Version version) {
if (version >= tenantMap.getLatestVersion()) {
tenantMap.createNewVersion(version);
tenantPrefixIndex->createNewVersion(version);
tenantPrefixIndex.createNewVersion(version);
tenantMap.insert(tenantName, tenantEntry);
auto view = tenantPrefixIndex->at(version);
auto view = tenantPrefixIndex.at(version);
auto itr = view.find(tenantEntry.prefix);
TenantNameUniqueSet nameSet;
if (itr != view.end()) {
@ -8628,7 +8627,7 @@ bool StorageServer::insertTenant(TenantNameRef tenantName, TenantMapEntry tenant
}
nameSet.insert(tenantName);
tenantPrefixIndex->insert(tenantEntry.prefix, nameSet);
tenantPrefixIndex.insert(tenantEntry.prefix, nameSet);
TraceEvent("InsertTenant", thisServerID).detail("Tenant", tenantName).detail("Version", version);
return true;
@ -8648,20 +8647,20 @@ void StorageServer::insertTenant(TenantNameRef tenantName, ValueRef value, Versi
void StorageServer::clearTenants(TenantNameRef startTenant, TenantNameRef endTenant, Version version) {
if (version >= tenantMap.getLatestVersion()) {
tenantMap.createNewVersion(version);
tenantPrefixIndex->createNewVersion(version);
tenantPrefixIndex.createNewVersion(version);
auto view = tenantMap.at(version);
for (auto itr = view.lower_bound(startTenant); itr != view.lower_bound(endTenant); ++itr) {
auto indexView = tenantPrefixIndex->at(version);
auto indexView = tenantPrefixIndex.at(version);
// Trigger any watches on the prefix associated with the tenant.
watches.triggerRange(itr->prefix, strinc(itr->prefix));
auto indexItr = indexView.find(itr->prefix);
ASSERT(indexItr != indexView.end());
TenantNameUniqueSet nameSet = *indexItr;
if (nameSet.remove(itr.key())) {
tenantPrefixIndex->erase(itr->prefix);
tenantPrefixIndex.erase(itr->prefix);
} else {
tenantPrefixIndex->insert(itr->prefix, nameSet);
tenantPrefixIndex.insert(itr->prefix, nameSet);
}
TraceEvent("EraseTenant", thisServerID).detail("Tenant", itr.key()).detail("Version", version);
}
@ -9348,7 +9347,7 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
newOldestVersion, desiredVersion, bytesLeft, unlimitedCommitBytes);
if (data->tenantMap.getLatestVersion() < newOldestVersion) {
data->tenantMap.createNewVersion(newOldestVersion);
data->tenantPrefixIndex->createNewVersion(newOldestVersion);
data->tenantPrefixIndex.createNewVersion(newOldestVersion);
}
// We want to forget things from these data structures atomically with changing oldestVersion (and "before",
// since oldestVersion.set() may trigger waiting actors) forgetVersionsBeforeAsync visibly forgets
@ -9356,7 +9355,7 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
Future<Void> finishedForgetting =
data->mutableData().forgetVersionsBeforeAsync(newOldestVersion, TaskPriority::UpdateStorage) &&
data->tenantMap.forgetVersionsBeforeAsync(newOldestVersion, TaskPriority::UpdateStorage) &&
data->tenantPrefixIndex->forgetVersionsBeforeAsync(newOldestVersion, TaskPriority::UpdateStorage);
data->tenantPrefixIndex.forgetVersionsBeforeAsync(newOldestVersion, TaskPriority::UpdateStorage);
data->oldestVersion.set(newOldestVersion);
wait(finishedForgetting);
wait(yield(TaskPriority::UpdateStorage));
@ -9468,7 +9467,7 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
durableDelay = delay(SERVER_KNOBS->STORAGE_COMMIT_INTERVAL, TaskPriority::UpdateStorage);
}
wait(ioTimeoutError(durable, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME));
wait(ioTimeoutError(durable, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, "StorageCommit"));
data->storageCommitLatencyHistogram->sampleSeconds(now() - beforeStorageCommit);
debug_advanceMinCommittedVersion(data->thisServerID, data->storageMinRecoverVersion);
@ -10165,7 +10164,7 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
data->tenantMap.insert(tenantName, tenantEntry);
auto view = data->tenantPrefixIndex->at(version);
auto view = data->tenantPrefixIndex.at(version);
auto itr = view.find(tenantEntry.prefix);
TenantNameUniqueSet nameSet;
if (itr != view.end()) {
@ -10173,7 +10172,7 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
}
nameSet.insert(tenantName);
data->tenantPrefixIndex->insert(tenantEntry.prefix, nameSet);
data->tenantPrefixIndex.insert(tenantEntry.prefix, nameSet);
TraceEvent("RestoringTenant", data->thisServerID)
.detail("Key", tenantMap[tenantMapLoc].key)
@ -11275,7 +11274,6 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
self.tag = seedTag;
}
self.encryptionKeyProvider->setTenantPrefixIndex(self.tenantPrefixIndex);
self.storage.makeNewStorageServerDurable(self.shardAware);
wait(self.storage.commit());
++self.counters.kvCommits;
@ -11358,13 +11356,6 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
recovered.send(Void());
return Void();
}
// Pass a reference of tenantPrefixIndex to the storage engine to support per-tenant data encryption,
// after the tenant map is recovered in restoreDurableState. In case of a storage server reboot,
// it is possible that the storage engine is still holding a pre-reboot tenantPrefixIndex and using that
// for its own recovery, before we set the tenantPrefixIndex here.
if (self.encryptionKeyProvider.isValid()) {
self.encryptionKeyProvider->setTenantPrefixIndex(self.tenantPrefixIndex);
}
TraceEvent("SSTimeRestoreDurableState", self.thisServerID).detail("TimeTaken", now() - start);
// if this is a tss storage file, use that as source of truth for this server being a tss instead of the

Some files were not shown because too many files have changed in this diff.