Merge remote-tracking branch 'origin/main' into bugfixes/machines-attrition-debugging

This commit is contained in:
Markus Pilman 2022-10-24 15:24:36 -06:00
commit e7b5b870a3
161 changed files with 3561 additions and 1376 deletions

View File

@ -274,85 +274,21 @@ if(NOT WIN32)
@CLUSTER_FILE@
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
)
add_fdbclient_test(
NAME fdb_c_api_tests
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_local_only
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/local_tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_blob_granule
DISABLE_LOG_DUMP
API_TEST_BLOB_GRANULES_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests
--blob-granule-local-file-path
@DATA_DIR@/fdbblob/
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_with_tls
DISABLE_LOG_DUMP
TLS_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--tls-cert-file
@CLIENT_CERT_FILE@
--tls-key-file
@CLIENT_KEY_FILE@
--tls-ca-file
@SERVER_CA_FILE@
)
file(GLOB API_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/test/apitester/tests/*.toml")
foreach(test_file ${API_TEST_FILES})
get_filename_component(file_name "${test_file}" NAME_WE)
set(test_name "fdb_c_api_test_${file_name}")
add_test(NAME "${test_name}"
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--api-tester-bin $<TARGET_FILE:fdb_c_api_tester>
--external-client-library ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-file ${test_file}
--knob delete-native-lib-after-loading=false
)
set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300)
endforeach()
add_test(NAME fdb_c_upgrade_to_future_version
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
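Note: the new loop above registers one ctest target per TOML file, named fdb_c_api_test_{file_name}. For instance (the file name here is hypothetical), a spec tests/ApiCorrectness.toml becomes runnable as:

    ctest -R fdb_c_api_test_ApiCorrectness -VV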

View File

@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -279,9 +279,9 @@ bool parseArgs(TesterOptions& options, int argc, char** argv) {
return true;
}
void fdb_check(fdb::Error e) {
if (e) {
fmt::print(stderr, "Unexpected FDB error: {}({})\n", e.code(), e.what());
void fdb_check(fdb::Error e, std::string_view msg, fdb::Error::CodeType expectedError = error_code_success) {
if (e.code()) {
fmt::print(stderr, "{}, Error: {}({})\n", msg, e.code(), e.what());
std::abort();
}
}
@ -453,13 +453,13 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
if (!runWorkloads(options)) {
retCode = 1;
}
fdb_check(fdb::network::stop());
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join();
} catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());

View File

@ -1,29 +0,0 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -29,31 +29,39 @@ from pathlib import Path
import glob
import random
import string
import toml
sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "tests", "TestRunner")]
# fmt: off
from tmp_cluster import TempCluster
from local_cluster import TLSConfig
# fmt: on
TESTER_STATS_INTERVAL_SEC = 5
def random_string(len):
return ''.join(random.choice(string.ascii_letters + string.digits) for i in range(len))
return "".join(random.choice(string.ascii_letters + string.digits) for i in range(len))
def get_logger():
return logging.getLogger('foundationdb.run_c_api_tests')
return logging.getLogger("foundationdb.run_c_api_tests")
def initialize_logger_level(logging_level):
logger = get_logger()
assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']
assert logging_level in ["DEBUG", "INFO", "WARNING", "ERROR"]
logging.basicConfig(format='%(message)s')
if logging_level == 'DEBUG':
logging.basicConfig(format="%(message)s")
if logging_level == "DEBUG":
logger.setLevel(logging.DEBUG)
elif logging_level == 'INFO':
elif logging_level == "INFO":
logger.setLevel(logging.INFO)
elif logging_level == 'WARNING':
elif logging_level == "WARNING":
logger.setLevel(logging.WARNING)
elif logging_level == 'ERROR':
elif logging_level == "ERROR":
logger.setLevel(logging.ERROR)
@ -65,35 +73,52 @@ def dump_client_logs(log_dir):
print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file))
def run_tester(args, test_file):
cmd = [args.tester_binary,
"--cluster-file", args.cluster_file,
"--test-file", test_file,
"--stats-interval", str(TESTER_STATS_INTERVAL_SEC*1000)]
def run_tester(args, cluster, test_file):
build_dir = Path(args.build_dir).resolve()
tester_binary = Path(args.api_tester_bin).resolve()
external_client_library = build_dir.joinpath("bindings", "c", "libfdb_c_external.so")
log_dir = Path(cluster.log).joinpath("client")
log_dir.mkdir(exist_ok=True)
cmd = [
tester_binary,
"--cluster-file",
cluster.cluster_file,
"--test-file",
test_file,
"--stats-interval",
str(TESTER_STATS_INTERVAL_SEC * 1000),
"--tmp-dir",
cluster.tmp_dir,
"--log",
"--log-dir",
str(log_dir),
]
if args.external_client_library is not None:
cmd += ["--external-client-library", args.external_client_library]
if args.tmp_dir is not None:
cmd += ["--tmp-dir", args.tmp_dir]
log_dir = None
if args.log_dir is not None:
log_dir = Path(args.log_dir).joinpath(random_string(8))
log_dir.mkdir(exist_ok=True)
cmd += ['--log', "--log-dir", str(log_dir)]
external_client_library = Path(args.external_client_library).resolve()
cmd += ["--external-client-library", external_client_library]
if args.blob_granule_local_file_path is not None:
cmd += ["--blob-granule-local-file-path",
args.blob_granule_local_file_path]
if cluster.blob_granules_enabled:
cmd += [
"--blob-granule-local-file-path",
str(cluster.data.joinpath("fdbblob")) + os.sep,
]
if args.tls_ca_file is not None:
cmd += ["--tls-ca-file", args.tls_ca_file]
if cluster.tls_config is not None:
cmd += [
"--tls-ca-file",
cluster.server_ca_file,
"--tls-key-file",
cluster.client_key_file,
"--tls-cert-file",
cluster.client_cert_file,
]
if args.tls_key_file is not None:
cmd += ["--tls-key-file", args.tls_key_file]
for knob in args.knobs:
knob_name, knob_value = knob.split("=")
cmd += ["--knob-" + knob_name, knob_value]
if args.tls_cert_file is not None:
cmd += ["--tls-cert-file", args.tls_cert_file]
get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd))
get_logger().info("\nRunning tester '%s'..." % " ".join(map(str, cmd)))
proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
timed_out = False
ret_code = 1
@ -103,34 +128,76 @@ def run_tester(args, test_file):
proc.kill()
timed_out = True
except Exception as e:
raise Exception('Unable to run tester (%s)' % e)
raise Exception("Unable to run tester (%s)" % e)
if ret_code != 0:
if timed_out:
reason = 'timed out after %d seconds' % args.timeout
reason = "timed out after %d seconds" % args.timeout
elif ret_code < 0:
reason = signal.Signals(-ret_code).name
else:
reason = 'exit code: %d' % ret_code
get_logger().error('\n\'%s\' did not complete successfully (%s)' %
(cmd[0], reason))
if (log_dir is not None):
reason = "exit code: %d" % ret_code
get_logger().error("\n'%s' did not complete succesfully (%s)" % (cmd[0], reason))
if log_dir is not None:
dump_client_logs(log_dir)
get_logger().info('')
get_logger().info("")
return ret_code
class TestConfig:
def __init__(self, test_file):
config = toml.load(test_file)
server_config = config.get("server", [{}])[0]
self.tenants_enabled = server_config.get("tenants_enabled", True)
self.blob_granules_enabled = server_config.get("blob_granules_enabled", False)
self.tls_enabled = server_config.get("tls_enabled", False)
self.client_chain_len = server_config.get("tls_client_chain_len", 2)
self.server_chain_len = server_config.get("tls_server_chain_len", 3)
self.min_num_processes = server_config.get("min_num_processes", 1)
self.max_num_processes = server_config.get("max_num_processes", 3)
self.num_processes = random.randint(self.min_num_processes, self.max_num_processes)
def run_test(args, test_file):
config = TestConfig(test_file)
tls_config = None
if config.tls_enabled:
tls_config = TLSConfig(
server_chain_len=config.server_chain_len,
client_chain_len=config.client_chain_len,
)
with TempCluster(
args.build_dir,
config.num_processes,
enable_tenants=config.tenants_enabled,
blob_granules_enabled=config.blob_granules_enabled,
tls_config=tls_config,
) as cluster:
ret_code = run_tester(args, cluster, test_file)
if not cluster.check_cluster_logs():
ret_code = 1 if ret_code == 0 else ret_code
return ret_code
def run_tests(args):
num_failed = 0
test_files = [f for f in os.listdir(args.test_dir) if os.path.isfile(
os.path.join(args.test_dir, f)) and f.endswith(".toml")]
if args.test_file is not None:
test_files = [Path(args.test_file).resolve()]
else:
test_files = [
f
for f in os.listdir(args.test_dir)
if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml")
]
for test_file in test_files:
get_logger().info('=========================================================')
get_logger().info('Running test %s' % test_file)
get_logger().info('=========================================================')
ret_code = run_tester(args, os.path.join(args.test_dir, test_file))
get_logger().info("=========================================================")
get_logger().info("Running test %s" % test_file)
get_logger().info("=========================================================")
ret_code = run_test(args, os.path.join(args.test_dir, test_file))
if ret_code != 0:
num_failed += 1
@ -138,32 +205,49 @@ def run_tests(args):
def parse_args(argv):
parser = argparse.ArgumentParser(description='FoundationDB C API Tester')
parser.add_argument('--cluster-file', type=str, default="fdb.cluster",
help='The cluster file for the cluster being connected to. (default: fdb.cluster)')
parser.add_argument('--tester-binary', type=str, default="fdb_c_api_tester",
help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)')
parser.add_argument('--external-client-library', type=str, default=None,
help='Path to the external client library. (default: None)')
parser.add_argument('--test-dir', type=str, default="./",
help='Path to a directory with test definitions. (default: ./)')
parser.add_argument('--timeout', type=int, default=300,
help='The timeout in seconds for running each individual test. (default 300)')
parser.add_argument('--log-dir', type=str, default=None,
help='The directory for storing logs (default: None)')
parser.add_argument('--logging-level', type=str, default='INFO',
choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').')
parser.add_argument('--tmp-dir', type=str, default=None,
help='The directory for storing temporary files (default: None)')
parser.add_argument('--blob-granule-local-file-path', type=str, default=None,
help='Enable blob granule tests if set, value is path to local blob granule files')
parser.add_argument('--tls-ca-file', type=str, default=None,
help='Path to client\'s TLS CA file: i.e. certificate of CA that signed the server certificate')
parser.add_argument('--tls-cert-file', type=str, default=None,
help='Path to client\'s TLS certificate file')
parser.add_argument('--tls-key-file', type=str, default=None,
help='Path to client\'s TLS private key file')
parser = argparse.ArgumentParser(description="FoundationDB C API Tester")
parser.add_argument("--build-dir", "-b", type=str, required=True, help="FDB build directory")
parser.add_argument("--api-tester-bin", type=str, help="Path to the fdb_c_api_tester executable.", required=True)
parser.add_argument("--external-client-library", type=str, help="Path to the external client library.")
parser.add_argument(
"--cluster-file",
type=str,
default="fdb.cluster",
help="The cluster file for the cluster being connected to. (default: fdb.cluster)",
)
parser.add_argument(
"--test-dir",
type=str,
default="./",
help="Path to a directory with test definitions. (default: ./)",
)
parser.add_argument(
"--test-file",
type=str,
default=None,
help="Path to a single test definition to be executed, overrides --test-dir if set.",
)
parser.add_argument(
"--timeout",
type=int,
default=300,
help="The timeout in seconds for running each individual test. (default 300)",
)
parser.add_argument(
"--logging-level",
type=str,
default="INFO",
choices=["ERROR", "WARNING", "INFO", "DEBUG"],
help="Specifies the level of detail in the tester output (default='INFO').",
)
parser.add_argument(
"--knob",
type=str,
default=[],
action="append",
dest="knobs",
help="[lowercase-knob-name]=[knob-value] (there may be multiple --knob options)",
)
return parser.parse_args(argv)
@ -174,5 +258,5 @@ def main(argv):
return run_tests(args)
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))

View File

@ -12,13 +12,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,13 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -11,13 +11,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,28 @@
[[test]]
title = 'Cancel Transaction with Database per Transaction with TLS'
multiThreaded = true
buggify = true
databasePerTransaction = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -11,15 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -0,0 +1,29 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -11,23 +11,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -4,23 +4,23 @@ minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -0,0 +1,37 @@
[[test]]
title = 'API Correctness with TLS'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -11,23 +11,22 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000

View File

@ -9,13 +9,13 @@ maxClients = 8
minTenants = 2
maxTenants = 5
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,25 @@
[[test]]
title = 'Multi-tenant API Correctness Multi Threaded'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minClients = 2
maxClients = 8
minTenants = 2
maxTenants = 5
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9

View File

@ -12,13 +12,13 @@ maxClientThreads = 4
minClients = 2
maxClients = 4
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,28 @@
[[test]]
title = 'Test tampering the cluster file with TLS'
multiThreaded = true
buggify = true
tamperClusterFile = true
minFdbThreads = 2
maxFdbThreads = 4
minDatabases = 2
maxDatabases = 4
minClientThreads = 2
maxClientThreads = 4
minClients = 2
maxClients = 4
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -46,7 +46,7 @@ int main(int argc, char** argv) {
}
fdb_check(fdb_select_api_version(FDB_API_VERSION));
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
fdb_check(
fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));

View File

@ -321,7 +321,16 @@ int populate(Database db,
const auto key_begin = insertBegin(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
const auto key_end = insertEnd(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
auto key_checkpoint = key_begin; // in case of commit failure, restart from this key
double required_keys = (key_end - key_begin + 1) * args.load_factor;
for (auto i = key_begin; i <= key_end; i++) {
// Choose required_keys out of the remaining (key_end - i + 1) keys at random, so each key is
// selected with probability required_keys / (key_end - i + 1): generate a random number in
// [0, 1) and keep the key if the number is at most that probability.
double r = rand() / (1.0 + RAND_MAX);
if (r > required_keys / (key_end - i + 1)) {
continue;
}
--required_keys;
/* sequential keys */
genKey(keystr.data(), KEY_PREFIX, args, i);
/* random values */
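The loop above is selection sampling (Knuth's Algorithm S): visiting candidate keys in order and accepting each with probability required/remaining draws a uniform sample that preserves key order; with an integer requirement the count is exact, and with a fractional load_factor it is approximate. A self-contained sketch of the technique, with hypothetical names rather than mako's actual helpers:

    #include <cstdlib>

    // Visit keys [begin, end] in order, selecting `required` of them uniformly at random.
    void select_keys(int begin, int end, double required) {
        for (int i = begin; i <= end; i++) {
            double r = rand() / (1.0 + RAND_MAX); // uniform in [0, 1)
            if (r > required / (end - i + 1))
                continue; // key i not selected
            --required;
            // ... process selected key i ...
        }
    }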
@ -984,6 +993,7 @@ int initArguments(Arguments& args) {
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
@ -1166,6 +1176,7 @@ void usage() {
printf("%-24s %s\n", "-t, --threads=THREADS", "Specify number of worker threads");
printf("%-24s %s\n", " --async_xacts", "Specify number of concurrent transactions to be run in async mode");
printf("%-24s %s\n", "-r, --rows=ROWS", "Specify number of records");
printf("%-24s %s\n", "-l, --load_factor=LOAD_FACTOR", "Specify load factor");
printf("%-24s %s\n", "-s, --seconds=SECONDS", "Specify the test duration in seconds\n");
printf("%-24s %s\n", "", "This option cannot be specified with --iteration.");
printf("%-24s %s\n", "-i, --iteration=ITERS", "Specify the number of iterations.\n");
@ -1228,6 +1239,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "threads", required_argument, NULL, 't' },
{ "async_xacts", required_argument, NULL, ARG_ASYNC },
{ "rows", required_argument, NULL, 'r' },
{ "load_factor", required_argument, NULL, 'l' },
{ "seconds", required_argument, NULL, 's' },
{ "iteration", required_argument, NULL, 'i' },
{ "keylen", required_argument, NULL, ARG_KEYLEN },
@ -1304,6 +1316,9 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.rows = atoi(optarg);
args.row_digits = digits(args.rows);
break;
case 'l':
args.load_factor = atof(optarg);
break;
case 's':
args.seconds = atoi(optarg);
break;
@ -1523,6 +1538,10 @@ int validateArguments(Arguments const& args) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
@ -2118,6 +2137,7 @@ int statsProcessMain(Arguments const& args,
fmt::fprintf(fp, "\"async_xacts\": %d,", args.async_xacts);
fmt::fprintf(fp, "\"mode\": %d,", args.mode);
fmt::fprintf(fp, "\"rows\": %d,", args.rows);
fmt::fprintf(fp, "\"load_factor\": %lf,", args.load_factor);
fmt::fprintf(fp, "\"seconds\": %d,", args.seconds);
fmt::fprintf(fp, "\"iteration\": %d,", args.iteration);
fmt::fprintf(fp, "\"tpsmax\": %d,", args.tpsmax);

View File

@ -138,6 +138,7 @@ struct Arguments {
int async_xacts;
int mode;
int rows; /* is 2 billion enough? */
double load_factor;
int row_digits;
int seconds;
int iteration;

View File

@ -233,7 +233,7 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
// Try calling some basic functionality that is available
// in all recent API versions

View File

@ -271,7 +271,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
timeoutDb = fdb_open_database(argv[1]);

View File

@ -66,7 +66,7 @@ TEST_CASE("setup") {
},
&context));
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
CHECK(!context.called);
fdb_check(fdb_stop_network());

View File

@ -68,7 +68,7 @@ int main(int argc, char** argv) {
set_net_opt(FDBNetworkOption::FDB_NET_OPTION_TRACE_PARTIAL_FILE_SUFFIX, trace_partial_file_suffix);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
// Apparently you need to open a database to initialize logging
FDBDatabase* out;

View File

@ -2998,7 +2998,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
clusterFilePath = std::string(argv[1]);

View File

@ -88,7 +88,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
{
FDBCluster* cluster;

View File

@ -392,11 +392,6 @@ func (o DatabaseOptions) SetTransactionIncludePortInAddress() error {
return o.setOpt(505, nil)
}
// Set a random idempotency id for all transactions. See the transaction option description for more information.
func (o DatabaseOptions) SetTransactionAutomaticIdempotency() error {
return o.setOpt(506, nil)
}
// Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information.
func (o DatabaseOptions) SetTransactionBypassUnreadable() error {
return o.setOpt(700, nil)
@ -556,18 +551,6 @@ func (o TransactionOptions) SetSizeLimit(param int64) error {
return o.setOpt(503, int64ToBytes(param))
}
// Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes.
//
// Parameter: Unique ID
func (o TransactionOptions) SetIdempotencyId(param string) error {
return o.setOpt(504, []byte(param))
}
// Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future.
func (o TransactionOptions) SetAutomaticIdempotency() error {
return o.setOpt(505, nil)
}
// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior.
func (o TransactionOptions) SetSnapshotRywEnable() error {
return o.setOpt(600, nil)

View File

@ -320,11 +320,11 @@ function(create_long_running_correctness_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${package_files}
${out_dir}/joshua_test

View File

@ -0,0 +1,9 @@
#!/bin/sh
# Simulation currently has memory leaks. We need to investigate before we can enable leak detection in joshua.
export ASAN_OPTIONS="detect_leaks=0"
OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}"
#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false
python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --long-running

View File

@ -0,0 +1,3 @@
#!/bin/bash -u
python3 -m test_harness.timeout --long-running

View File

@ -184,6 +184,8 @@ class Config:
self.reproduce_prefix: str | None = None
self.reproduce_prefix_args = {'type': str, 'required': False,
'help': 'When printing the results, prepend this string to the command'}
self.long_running: bool = False
self.long_running_args = {'action': 'store_true'}
self._env_names: Dict[str, str] = {}
self._config_map = self._build_map()
self._read_env()

View File

@ -303,6 +303,7 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@ -375,7 +376,7 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)

View File

@ -384,6 +384,7 @@ class Summary:
child.attributes['Severity'] = '40'
child.attributes['ErrorCount'] = str(self.errors)
self.out.append(child)
self.error = True
if self.was_killed:
child = SummaryTree('ExternalTimeout')
child.attributes['Severity'] = '40'
@ -420,6 +421,7 @@ class Summary:
child = SummaryTree('TestUnexpectedlyNotFinished')
child.attributes['Severity'] = '40'
self.out.append(child)
self.error = True
if self.error_out is not None and len(self.error_out) > 0:
lines = self.error_out.splitlines()
stderr_bytes = 0

View File

@ -47,6 +47,12 @@ Note that the quotas are specified in terms of bytes/second, and internally conv
page_cost_quota = ceiling(byte_quota / CLIENT_KNOBS->READ_COST_BYTE_FACTOR)
```
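For example, assuming `READ_COST_BYTE_FACTOR` is at its default of 16384 (an assumption; this knob is configurable), a total quota of 32768 bytes/second is stored internally as:

```
page_cost_quota = ceiling(32768 / 16384) = 2
```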
To clear both the reserved and total throughput quotas for a tag, run:
```
fdbcli> quota clear <tag>
```
### Limit Calculation
The transaction budget that ratekeeper calculates and distributes to clients (via GRV proxies) for each tag is calculated based on several intermediate rate calculations, outlined in this section.

View File

@ -524,6 +524,12 @@ The ``start`` command will start a new restore on the specified (or default) tag
``--inconsistent-snapshot-only``
Ignore mutation log files during the restore to speedup the process. Because only range files are restored, this option gives an inconsistent snapshot in most cases and is not recommended to use.
``--user-data``
Restore only the user keyspace. This option should NOT be used alongside --system-metadata (below) and CANNOT be used alongside other specified key ranges.
``--system-metadata``
Restore only the relevant system keyspace. This option should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.
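For illustration, a hypothetical invocation that restores only the user keyspace (the backup URL and cluster file are placeholders):

.. code-block:: bash

   fdbrestore start -r <BACKUP_URL> --dest-cluster-file <CLUSTER_FILE> --user-data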
.. program:: fdbrestore abort
``abort``

View File

@ -648,6 +648,16 @@ The subclasses of the ``ApiWorkload`` inherit the following configuration option
initiated by a test script to check if the client workload is successfully progressing after a
cluster change.
The FDB server configuration can be specialized in the section ``[[server]]``:
- ``tenants_enabled``: enable multitenancy (default: true)
- ``blob_granules_enabled``: enable support for blob granules (default: false)
- ``tls_enabled``: enable TLS (default: false)
- ``tls_client_chain_len``: the length of the client-side TLS chain (default: 2)
- ``tls_server_chain_len``: the length of the server-side TLS chain (default: 3)
- ``min_num_processes`` and ``max_num_processes``: the number of FDB server processes,
chosen at random from the given range (default: 1-3)
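For example, a test that needs a single-process cluster with TLS enabled could combine these options as follows (the values shown are illustrative):

.. code-block:: toml

   [[server]]
   tls_enabled = true
   tls_client_chain_len = 2
   tls_server_chain_len = 3
   max_num_processes = 1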
Executing the Tests
===================
@ -656,19 +666,35 @@ according to its specification. Before that we must create a FDB cluster and pas
a parameter to ``fdb_c_api_tester``. Note that multithreaded tests also need to be provided with an
external client library.
For example, we can create a temporary cluster and use it for execution of one of the existing API tests:
The ``run_c_api_tests.py`` script automates execution of the API tests on a local cluster. The cluster
is created according to the options specified in the ``[[server]]`` section of the given test file.
.. code-block:: bash
${srcDir}/tests/TestRunner/tmp_cluster.py --build-dir ${buildDir} -- \
${buildDir}/bin/fdb_c_api_tester \
--cluster-file @CLUSTER_FILE@ \
--external-client-library=${buildDir}/bindings/c/libfdb_c_external.so \
${srcDir}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${buildDir}
--api-tester-bin ${buildDir}/bin/fdb_c_api_tester
--external-client-library ${buildDir}/bindings/c/libfdb_c_external.so
--test-file ${srcDir}/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
The test specifications added to the ``bindings/c/test/apitester/tests/`` directory are executed as a part
of the regression test suite. They can be executed using the ``ctest`` target ``fdb_c_api_tests``:
of the regression test suite as ``ctest`` targets with names ``fdb_c_api_test_{file_name}``.
The ``ctest`` targets provide a more convenient way to execute the API tests. We can execute
a single test:
.. code-block:: bash
ctest -R fdb_c_api_tests -VV
ctest -R fdb_c_api_test_CApiCorrectnessMultiThr -VV
or execute all of them in parallel (here ``-j20`` specifies the parallelization level):
.. code-block:: bash
ctest -R fdb_c_api_test_ -j20 --output-on-failure
More sophisticated filters can be applied to execute a selected set of tests, e.g. the tests using TLS:
.. code-block:: bash
ctest -R 'fdb_c_api_test_.*TLS' -j20 --output-on-failure

View File

@ -528,7 +528,8 @@
"duplicate_mutation_fetch_timeout",
"primary_dc_missing",
"fetch_primary_dc_timeout",
"fetch_storage_wiggler_stats_timeout"
"fetch_storage_wiggler_stats_timeout",
"fetch_consistency_scan_info_timeout"
]
},
"issues":[

View File

@ -47,6 +47,7 @@
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/S3BlobStore.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/json_spirit/json_spirit_writer_template.h"
#include "flow/Platform.h"
@ -155,6 +156,11 @@ enum {
OPT_RESTORE_CLUSTERFILE_ORIG,
OPT_RESTORE_BEGIN_VERSION,
OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY,
// The two restore options below allow callers of fdbrestore to split a normal restore into one
// that restores just the system keyspace and another that restores just the user keyspace. This
// is unlike the backup command, where all keys (both system and user) are backed up together.
OPT_RESTORE_USER_DATA,
OPT_RESTORE_SYSTEM_DATA,
// Shared constants
OPT_CLUSTERFILE,
@ -696,6 +702,8 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = {
{ OPT_BACKUPKEYS, "--keys", SO_REQ_SEP },
{ OPT_WAITFORDONE, "-w", SO_NONE },
{ OPT_WAITFORDONE, "--waitfordone", SO_NONE },
{ OPT_RESTORE_USER_DATA, "--user-data", SO_NONE },
{ OPT_RESTORE_SYSTEM_DATA, "--system-metadata", SO_NONE },
{ OPT_RESTORE_VERSION, "--version", SO_REQ_SEP },
{ OPT_RESTORE_VERSION, "-v", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
@ -1187,6 +1195,13 @@ static void printRestoreUsage(bool devhelp) {
printf(" The cluster file for the original database from which the backup was created. The "
"original database\n");
printf(" is only needed to convert a --timestamp argument to a database version.\n");
printf(" --user-data\n"
" Restore only the user keyspace. This option should NOT be used alongside "
"--system-metadata (below) and CANNOT be used alongside other specified key ranges.\n");
printf(
" --system-metadata\n"
" Restore only the relevant system keyspace. This option "
"should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.\n");
if (devhelp) {
#ifdef _WIN32
@ -3367,6 +3382,8 @@ int main(int argc, char* argv[]) {
bool trace = false;
bool quietDisplay = false;
bool dryRun = false;
bool restoreSystemKeys = false;
bool restoreUserKeys = false;
// TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start
bool encryptionEnabled = true;
std::string traceDir = "";
@ -3691,6 +3708,14 @@ int main(int argc, char* argv[]) {
restoreVersion = ver;
break;
}
case OPT_RESTORE_USER_DATA: {
restoreUserKeys = true;
break;
}
case OPT_RESTORE_SYSTEM_DATA: {
restoreSystemKeys = true;
break;
}
case OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY: {
inconsistentSnapshotOnly.set(true);
break;
@ -3838,6 +3863,11 @@ int main(int argc, char* argv[]) {
}
}
if (restoreSystemKeys && restoreUserKeys) {
fprintf(stderr, "ERROR: Please only specify one of --user-data or --system-metadata, not both\n");
return FDB_EXIT_ERROR;
}
if (trace) {
if (!traceLogGroup.empty())
setNetworkOption(FDBNetworkOptions::TRACE_LOG_GROUP, StringRef(traceLogGroup));
@ -3938,10 +3968,30 @@ int main(int argc, char* argv[]) {
// The fastrestore tool does not yet support multiple ranges and is incompatible with tenants
// or other features that back up data in the system keys
if (backupKeys.empty() && programExe != ProgramExe::FASTRESTORE_TOOL) {
if (!restoreSystemKeys && !restoreUserKeys && backupKeys.empty() &&
programExe != ProgramExe::FASTRESTORE_TOOL) {
addDefaultBackupRanges(backupKeys);
}
if ((restoreSystemKeys || restoreUserKeys) && programExe == ProgramExe::FASTRESTORE_TOOL) {
fprintf(stderr, "ERROR: Options: --user-data and --system-metadata are not supported with fastrestore\n");
return FDB_EXIT_ERROR;
}
if ((restoreUserKeys || restoreSystemKeys) && !backupKeys.empty()) {
fprintf(stderr,
"ERROR: Cannot specify additional ranges when using --user-data or --system-metadata "
"options\n");
return FDB_EXIT_ERROR;
}
if (restoreUserKeys) {
backupKeys.push_back_deep(backupKeys.arena(), normalKeys);
} else if (restoreSystemKeys) {
for (const auto& r : getSystemBackupRanges()) {
backupKeys.push_back_deep(backupKeys.arena(), r);
}
}
switch (programExe) {
case ProgramExe::AGENT:
if (!initCluster())

View File

@ -56,7 +56,7 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(tag.withPrefix(tagQuotaPrefix));
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
if (!v.present()) {
fmt::print("<empty>\n");
@ -77,11 +77,10 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, double value) {
state Reference<ITransaction> tr = db->createTransaction();
state Key key = tag.withPrefix(tagQuotaPrefix);
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(key);
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
ThrottleApi::TagQuotaValue quota;
if (v.present()) {
@ -94,8 +93,27 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
} else if (limitType == LimitType::RESERVED) {
quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();
}
ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota);
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully updated quota.\n");
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr->clear(ThrottleApi::getTagQuotaKey(tag));
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully cleared quota.\n");
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
@ -104,7 +122,7 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
}
constexpr auto usage = "quota [get <tag> [reserved_throughput|total_throughput] | set <tag> "
"[reserved_throughput|total_throughput] <value>]";
"[reserved_throughput|total_throughput] <value> | clear <tag>]";
bool exitFailure() {
fmt::print(usage);
@ -117,30 +135,40 @@ namespace fdb_cli {
ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
state bool result = true;
if (tokens.size() != 5 && tokens.size() != 6) {
if (tokens.size() < 3 || tokens.size() > 5) {
return exitFailure();
} else {
auto tag = parseTag(tokens[2]);
auto limitType = parseLimitType(tokens[3]);
if (!tag.present() || !limitType.present()) {
auto const tag = parseTag(tokens[2]);
if (!tag.present()) {
return exitFailure();
}
if (tokens[1] == "get"_sr) {
if (tokens.size() != 4) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
if (!limitType.present()) {
return exitFailure();
}
wait(getQuota(db, tag.get(), limitType.get()));
return true;
} else if (tokens[1] == "set"_sr) {
if (tokens.size() != 5) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
auto const limitValue = parseLimitValue(tokens[4]);
if (!limitValue.present()) {
if (!limitType.present() || !limitValue.present()) {
return exitFailure();
}
wait(setQuota(db, tag.get(), limitType.get(), limitValue.get()));
return true;
} else if (tokens[1] == "clear"_sr) {
if (tokens.size() != 3) {
return exitFailure();
}
wait(clearQuota(db, tag.get()));
return true;
} else {
return exitFailure();
}

View File

@ -542,8 +542,8 @@ void initHelp() {
"Displays the current read version of the database or currently running transaction.");
helpMap["quota"] = CommandHelp("quota",
"quota [get <tag> [reserved_throughput|total_throughput] | set <tag> "
"[reserved_throughput|total_throughput] <value>]",
"Get or modify the throughput quota for the specified tag.");
"[reserved_throughput|total_throughput] <value> | clear <tag>]",
"Get, modify, or clear the throughput quota for the specified tag.");
helpMap["reset"] =
CommandHelp("reset",
"reset the current transaction",
@ -1480,6 +1480,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (isCommitDesc && tokens.size() == 1) {
// prompt for description and add to txn
state Optional<std::string> raw;
warn.cancel();
while (!raw.present() || raw.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty.\n");
@ -1490,6 +1491,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
std::string line = raw.get();
config_tr->set("\xff\xff/description"_sr, line);
}
warn =
checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
if (transtype == TransType::Db) {
wait(commitTransaction(tr));
} else {
@ -1821,6 +1824,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (!intrans) {
// prompt for description and add to txn
state Optional<std::string> raw_desc;
warn.cancel();
while (!raw_desc.present() || raw_desc.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty\n");
@ -1830,6 +1834,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
}
std::string line = raw_desc.get();
config_tr->set("\xff\xff/description"_sr, line);
warn = checkStatus(
timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
wait(commitTransaction(config_tr));
} else {
isCommitDesc = true;

View File

@ -103,6 +103,59 @@ def maintenance(logger):
output3 = run_fdbcli_command('maintenance')
assert output3 == no_maintenance_output
@enable_logging()
def quota(logger):
# Should be a noop
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully cleared quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '<empty>'
# Ignored update
command = 'quota set red total_throughput 49152'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota set green total_throughput 32768'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota set green reserved_throughput 16384'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '32768'
command = 'quota get green reserved_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '16384'
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully cleared quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '<empty>'
# Too few arguments, should log help message
command = 'quota get green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
@enable_logging()
def setclass(logger):
@ -1035,6 +1088,7 @@ if __name__ == '__main__':
integer_options()
tls_address_suffix()
knobmanagement()
quota()
else:
assert args.process_number > 1, "Process number should be positive"
coordinators()

View File

@ -63,7 +63,7 @@ public:
m_buffer = Standalone<VectorRef<uint8_t>>(old.slice(size, old.size()));
// Write the old buffer to the underlying file and update the write offset
Future<Void> r = holdWhile(old, m_file->write(old.begin(), size, m_writeOffset));
Future<Void> r = uncancellable(holdWhile(old, m_file->write(old.begin(), size, m_writeOffset)));
m_writeOffset += size;
return r;

View File

@ -1057,6 +1057,9 @@ ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beg
beginVersion <= delta.clearVersion.get();
if (delta.values.empty()) {
return ParsedDeltaBoundaryRef(delta.key, clearAfter);
} else if (readVersion >= delta.values.back().version && beginVersion <= delta.values.back().version) {
// For all but at most one delta file in a read, readVersion covers every version in the file; optimize this common case.
return ParsedDeltaBoundaryRef(delta.key, clearAfter, delta.values.back());
}
auto valueAtVersion = std::lower_bound(delta.values.begin(),
delta.values.end(),
@ -1324,7 +1327,8 @@ typedef std::priority_queue<MergeStreamNext, std::vector<MergeStreamNext>, Order
static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
const std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>>& streams,
const std::vector<bool> startClears) {
const std::vector<bool> startClears,
GranuleMaterializeStats& stats) {
ASSERT(streams.size() < std::numeric_limits<int16_t>::max());
ASSERT(startClears.size() == streams.size());
@ -1337,6 +1341,10 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
std::set<int16_t, std::greater<int16_t>> activeClears;
int16_t maxActiveClear = -1;
// Trade memory for CPU: reserve capacity as if every merged row were an insert.
RangeResult result;
int maxExpectedSize = 0;
// check if a given stream is actively clearing
bool clearActive[streams.size()];
for (int16_t i = 0; i < streams.size(); i++) {
@ -1354,10 +1362,12 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
item.streamIdx = i;
item.dataIdx = 0;
next.push(item);
maxExpectedSize += streams[i].size();
result.arena().dependsOn(streams[i].arena());
}
}
result.reserve(result.arena(), maxExpectedSize);
RangeResult result;
std::vector<MergeStreamNext> cur;
cur.reserve(streams.size());
while (!next.empty()) {
@ -1373,6 +1383,7 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
// un-set clears and find latest value for key (if present)
bool foundValue = false;
bool includesSnapshot = cur.back().streamIdx == 0 && chunk.snapshotFile.present();
for (auto& it : cur) {
auto& v = streams[it.streamIdx][it.dataIdx];
if (clearActive[it.streamIdx]) {
@ -1391,7 +1402,14 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
if (v.isSet() && maxActiveClear < it.streamIdx) {
KeyRef finalKey =
chunk.tenantPrefix.present() ? v.key.removePrefix(chunk.tenantPrefix.get()) : v.key;
result.push_back_deep(result.arena(), KeyValueRef(finalKey, v.value));
result.push_back(result.arena(), KeyValueRef(finalKey, v.value));
if (!includesSnapshot) {
stats.rowsInserted++;
} else if (it.streamIdx > 0) {
stats.rowsUpdated++;
}
} else if (includesSnapshot) {
stats.rowsCleared++;
}
}
}
@ -1413,6 +1431,36 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
}
}
// FIXME: if memory assumption was wrong and result is significantly smaller than total input size, could copy it
// with push_back_deep to a new result. This is rare though
stats.outputBytes += result.expectedSize();
return result;
}
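mergeDeltaStreams pairs two ideas visible in the hunks above: reserve the worst-case output size once up front, and k-way merge version-sorted streams through a heap in which the highest stream index wins key ties, so newer deltas shadow older ones. A reduced standalone sketch of that merge shape (std::string keys instead of KeyRef, no arenas or clear tracking; all names illustrative):

#include <cstdint>
#include <queue>
#include <string>
#include <vector>

struct Item {
    std::string key;
    int16_t streamIdx;
    size_t dataIdx;
};
struct Order {
    bool operator()(const Item& a, const Item& b) const {
        // priority_queue pops the "largest" element, so invert the key order
        // to pop the smallest key first; on ties, pop the newest stream first.
        if (a.key != b.key)
            return a.key > b.key;
        return a.streamIdx < b.streamIdx;
    }
};

std::vector<std::string> mergeStreams(const std::vector<std::vector<std::string>>& streams) {
    std::vector<std::string> result;
    size_t maxExpected = 0;
    std::priority_queue<Item, std::vector<Item>, Order> next;
    for (int16_t i = 0; i < (int16_t)streams.size(); i++) {
        if (!streams[i].empty()) {
            next.push({ streams[i][0], i, 0 });
            maxExpected += streams[i].size();
        }
    }
    result.reserve(maxExpected); // trade memory for CPU: no regrowth mid-merge
    while (!next.empty()) {
        Item cur = next.top();
        next.pop();
        // Drop the same key from older streams, advancing each of them.
        while (!next.empty() && next.top().key == cur.key) {
            Item stale = next.top();
            next.pop();
            if (stale.dataIdx + 1 < streams[stale.streamIdx].size())
                next.push({ streams[stale.streamIdx][stale.dataIdx + 1], stale.streamIdx, stale.dataIdx + 1 });
        }
        result.push_back(cur.key);
        if (cur.dataIdx + 1 < streams[cur.streamIdx].size())
            next.push({ streams[cur.streamIdx][cur.dataIdx + 1], cur.streamIdx, cur.dataIdx + 1 });
    }
    return result;
}

int main() {
    auto merged = mergeStreams({ { "a", "c" }, { "b", "c" } });
    (void)merged; // { "a", "b", "c" }: the newer stream's "c" wins the tie
}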
RangeResult materializeJustSnapshot(const BlobGranuleChunkRef& chunk,
Optional<StringRef> snapshotData,
const KeyRange& requestRange,
GranuleMaterializeStats& stats) {
stats.inputBytes += snapshotData.get().size();
Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows = loadSnapshotFile(
chunk.snapshotFile.get().filename, snapshotData.get(), requestRange, chunk.snapshotFile.get().cipherKeysCtx);
RangeResult result;
if (!snapshotRows.empty()) {
result.arena().dependsOn(snapshotRows.arena());
result.reserve(result.arena(), snapshotRows.size());
for (auto& it : snapshotRows) {
// TODO REMOVE validation
ASSERT(it.op == MutationRef::Type::SetValue);
KeyRef finalKey = chunk.tenantPrefix.present() ? it.key.removePrefix(chunk.tenantPrefix.get()) : it.key;
result.push_back(result.arena(), KeyValueRef(finalKey, it.value));
}
stats.outputBytes += result.expectedSize();
stats.snapshotRows += result.size();
}
return result;
}
@ -1421,7 +1469,8 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]) {
StringRef deltaFileData[],
GranuleMaterializeStats& stats) {
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
@ -1438,12 +1487,18 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
requestRange = keyRange;
}
// fast case for only-snapshot read
if (chunk.snapshotFile.present() && chunk.deltaFiles.empty() && chunk.newDeltas.empty()) {
return materializeJustSnapshot(chunk, snapshotData, requestRange, stats);
}
std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>> streams;
std::vector<bool> startClears;
// +1 for possible snapshot, +1 for possible memory deltas
streams.reserve(chunk.deltaFiles.size() + 2);
if (snapshotData.present()) {
stats.inputBytes += snapshotData.get().size();
ASSERT(chunk.snapshotFile.present());
Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows =
loadSnapshotFile(chunk.snapshotFile.get().filename,
@ -1454,13 +1509,17 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
streams.push_back(snapshotRows);
startClears.push_back(false);
arena.dependsOn(streams.back().arena());
stats.snapshotRows += snapshotRows.size();
}
} else {
ASSERT(!chunk.snapshotFile.present());
}
if (BG_READ_DEBUG) {
fmt::print("Applying {} delta files\n", chunk.deltaFiles.size());
}
for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) {
stats.inputBytes += deltaFileData[deltaIdx].size();
bool startClear = false;
auto deltaRows = loadChunkedDeltaFile(chunk.deltaFiles[deltaIdx].filename,
deltaFileData[deltaIdx],
@ -1480,6 +1539,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size());
}
if (!chunk.newDeltas.empty()) {
stats.inputBytes += chunk.newDeltas.expectedSize();
// TODO REMOVE validation
ASSERT(beginVersion <= chunk.newDeltas.front().version);
ASSERT(readVersion >= chunk.newDeltas.back().version);
@ -1491,7 +1551,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
}
}
return mergeDeltaStreams(chunk, streams, startClears);
return mergeDeltaStreams(chunk, streams, startClears, stats);
}
struct GranuleLoadFreeHandle : NonCopyable, ReferenceCounted<GranuleLoadFreeHandle> {
@ -1560,8 +1620,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
GranuleLoadIds loadIds[files.size()];
int64_t inputBytes = 0;
int64_t outputBytes = 0;
try {
// Kick off first file reads if parallelism > 1
@ -1586,7 +1644,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
if (!snapshotData.get().begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
}
inputBytes += snapshotData.get().size();
}
// +1 to avoid UBSAN variable length array of size zero
@ -1599,16 +1656,11 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
if (!deltaData[i].begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
}
inputBytes += deltaData[i].size();
}
inputBytes += files[chunkIdx].newDeltas.expectedSize();
// materialize rows from chunk
chunkRows =
materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData);
outputBytes += chunkRows.expectedSize();
chunkRows = materializeBlobGranule(
files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData, stats);
results.arena().dependsOn(chunkRows.arena());
results.append(results.arena(), chunkRows.begin(), chunkRows.size());
@ -1616,8 +1668,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
// free once done by forcing FreeHandles to trigger
loadIds[chunkIdx].freeHandles.clear();
}
stats.inputBytes = inputBytes;
stats.outputBytes = outputBytes;
return ErrorOr<RangeResult>(results);
} catch (Error& e) {
return ErrorOr<RangeResult>(e);
@ -2303,6 +2353,7 @@ void checkDeltaRead(const KeyValueGen& kvGen,
// expected answer
std::map<KeyRef, ValueRef> expectedData;
Version lastFileEndVersion = 0;
GranuleMaterializeStats stats;
fmt::print("Delta Read [{0} - {1}) @ {2} - {3}\n",
range.begin.printable(),
@ -2322,7 +2373,7 @@ void checkDeltaRead(const KeyValueGen& kvGen,
chunk.includedVersion = readVersion;
chunk.snapshotVersion = invalidVersion;
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, {}, serialized);
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, {}, serialized, stats);
if (expectedData.size() != actualData.size()) {
fmt::print("Expected Data {0}:\n", expectedData.size());
@ -2430,6 +2481,7 @@ void checkGranuleRead(const KeyValueGen& kvGen,
}
Version lastFileEndVersion = 0;
applyDeltasByVersion(deltaData, range, beginVersion, readVersion, lastFileEndVersion, expectedData);
GranuleMaterializeStats stats;
// actual answer
Standalone<BlobGranuleChunkRef> chunk;
@ -2477,7 +2529,8 @@ void checkGranuleRead(const KeyValueGen& kvGen,
if (beginVersion == 0) {
snapshotPtr = serializedSnapshot;
}
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs);
RangeResult actualData =
materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs, stats);
if (expectedData.size() != actualData.size()) {
fmt::print("Expected Size {0} != Actual Size {1}\n", expectedData.size(), actualData.size());
@ -2663,6 +2716,14 @@ struct CommonPrefixStats {
int totalKeys = 0;
int minKeySize = 1000000000;
int maxKeySize = 0;
int64_t logicalBytes = 0;
int64_t totalLogicalBytes = 0;
int deltas = 0;
int deltasSet = 0;
int deltasClear = 0;
int deltasNoOp = 0;
int deltasClearAfter = 0;
void addKey(const KeyRef& k) {
if (len == -1) {
@ -2677,7 +2738,38 @@ struct CommonPrefixStats {
maxKeySize = std::max(maxKeySize, k.size());
}
void addKeyValue(const KeyRef& k, const ValueRef& v) {
addKey(k);
logicalBytes += k.size();
logicalBytes += v.size();
}
void addBoundary(const ParsedDeltaBoundaryRef& d) {
addKey(d.key);
deltas++;
if (d.isSet()) {
deltasSet++;
logicalBytes += d.value.size();
} else if (d.isClear()) {
deltasClear++;
} else {
ASSERT(d.isNoOp());
deltasNoOp++;
}
if (d.clearAfter) {
deltasClearAfter++;
}
}
void doneFile() {
totalLogicalBytes += logicalBytes;
fmt::print("Logical Size: {0}\n", logicalBytes);
logicalBytes = 0;
}
Key done() {
doneFile();
ASSERT(len >= 0);
fmt::print("Common prefix: {0}\nCommon Prefix Length: {1}\nAverage Key Size: {2}\nMin Key Size: {3}, Max Key "
"Size: {4}\n",
@ -2686,11 +2778,21 @@ struct CommonPrefixStats {
totalKeySize / totalKeys,
minKeySize,
maxKeySize);
if (deltas > 0) {
fmt::print("Delta stats: {0} deltas, {1} sets, {2} clears, {3} noops, {4} clearAfters\n",
deltas,
deltasSet,
deltasClear,
deltasNoOp,
deltasClearAfter);
}
fmt::print("Logical Size: {0}\n", totalLogicalBytes);
return key.substr(0, len);
}
};
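CommonPrefixStats tracks the longest prefix shared by every key it has seen. A compact standalone sketch of that incremental computation (PrefixTracker is an illustrative stand-in):

#include <algorithm>
#include <cassert>
#include <string>

struct PrefixTracker {
    std::string first;
    int len = -1; // -1 until the first key is seen
    void addKey(const std::string& k) {
        if (len == -1) {
            first = k;
            len = (int)k.size();
        } else {
            // Shrink the shared length to however far this key still agrees.
            int common = 0;
            int limit = std::min<int>(len, (int)k.size());
            while (common < limit && first[common] == k[common])
                common++;
            len = common;
        }
    }
    std::string done() const { return first.substr(0, std::max(len, 0)); }
};

int main() {
    PrefixTracker t;
    t.addKey("user/123");
    t.addKey("user/124");
    t.addKey("user/2");
    assert(t.done() == "user/");
}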
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames) {
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames, bool newFormat) {
FileSet files;
CommonPrefixStats stats;
for (int i = 0; i < filenames.size(); i++) {
@ -2701,40 +2803,66 @@ FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filena
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
Standalone<GranuleSnapshot> parsed;
if (!newFormat) {
Arena arena;
GranuleSnapshot file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena);
parsed = Standalone<GranuleSnapshot>(file, arena);
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
for (auto& it : parsed) {
stats.addKeyValue(it.key, it.value);
}
} else {
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res = loadSnapshotFile(""_sr, data, normalKeys, {});
fmt::print("Loaded {0} rows from snapshot file\n", res.size());
for (auto& it : res) {
stats.addKeyValue(it.key, it.value);
}
}
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
files.snapshotFile = { filenames[i], version, data, parsed };
for (auto& it : parsed) {
stats.addKey(it.key);
}
} else {
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
if (!newFormat) {
Arena arena;
GranuleDeltas file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena);
Standalone<GranuleDeltas> parsed(file, arena);
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
}
}
}
} else {
bool startClear = false;
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res =
loadChunkedDeltaFile(""_sr, data, normalKeys, 0, version, {}, startClear);
ASSERT(!startClear);
Standalone<GranuleDeltas> parsed;
fmt::print("Loaded {0} boundaries from delta file\n", res.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : res) {
stats.addBoundary(it);
}
}
}
stats.doneFile();
}
files.commonPrefix = stats.done();
@ -2792,6 +2920,28 @@ std::pair<int64_t, double> doDeltaWriteBench(const Standalone<GranuleDeltas>& da
return { serializedBytes, elapsed };
}
void chunkFromFileSet(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef>& chunk,
StringRef* deltaPtrs,
Version readVersion,
Optional<BlobGranuleCipherKeysCtx> keys,
int numDeltaFiles) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < numDeltaFiles; i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
FileSet rewriteChunkedFileSet(const FileSet& fileSet,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
@ -2818,40 +2968,30 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
KeyRange readRange,
bool clearAllAtEnd,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
int numDeltaFiles,
bool printStats = false) {
Version readVersion = std::get<1>(fileSet.deltaFiles.back());
Standalone<BlobGranuleChunkRef> chunk;
StringRef deltaPtrs[fileSet.deltaFiles.size()];
GranuleMaterializeStats stats;
ASSERT(numDeltaFiles >= 0 && numDeltaFiles <= fileSet.deltaFiles.size());
StringRef deltaPtrs[numDeltaFiles];
MutationRef clearAllAtEndMutation;
if (clearAllAtEnd) {
clearAllAtEndMutation = MutationRef(MutationRef::Type::ClearRange, readRange.begin, readRange.end);
}
if (chunked) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < fileSet.deltaFiles.size(); i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunkFromFileSet(fileSet, chunk, deltaPtrs, readVersion, keys, numDeltaFiles);
if (clearAllAtEnd) {
readVersion++;
MutationsAndVersionRef lastDelta;
lastDelta.version = readVersion;
lastDelta.mutations.push_back(chunk.arena(), clearAllAtEndMutation);
chunk.includedVersion = readVersion;
chunk.newDeltas.push_back_deep(chunk.arena(), lastDelta);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
int64_t serializedBytes = 0;
@ -2875,14 +3015,26 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
}
serializedBytes += actualData.expectedSize();
} else {
RangeResult actualData =
materializeBlobGranule(chunk, readRange, 0, readVersion, std::get<2>(fileSet.snapshotFile), deltaPtrs);
RangeResult actualData = materializeBlobGranule(
chunk, readRange, 0, readVersion, std::get<2>(fileSet.snapshotFile), deltaPtrs, stats);
serializedBytes += actualData.expectedSize();
}
}
elapsed += timer_monotonic();
elapsed /= READ_RUNS;
serializedBytes /= READ_RUNS;
if (printStats) {
fmt::print("Materialize stats:\n");
fmt::print(" Input bytes: {0}\n", stats.inputBytes / READ_RUNS);
fmt::print(" Output bytes: {0}\n", stats.outputBytes / READ_RUNS);
fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes);
fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows / READ_RUNS);
fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared / READ_RUNS);
fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted / READ_RUNS);
fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated / READ_RUNS);
}
return { serializedBytes, elapsed };
}
@ -2913,7 +3065,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
int64_t logicalSnapshotSize = 0;
int64_t logicalDeltaSize = 0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it);
FileSet fileSet = loadFileSet(basePath, it, false);
fileSets.push_back(fileSet);
logicalSnapshotSize += std::get<3>(fileSet.snapshotFile).expectedSize();
for (auto& deltaFile : fileSet.deltaFiles) {
@ -2944,7 +3096,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3000,9 +3152,16 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
std::vector<std::string> readRunNames = {};
std::vector<std::pair<int64_t, double>> readMetrics;
bool doEdgeCaseReadTests = true;
bool doEdgeCaseReadTests = false;
bool doVaryingDeltaTests = false;
std::vector<double> clearAllReadMetrics;
std::vector<double> readSingleKeyMetrics;
std::vector<std::vector<std::pair<int64_t, double>>> varyingDeltaMetrics;
size_t maxDeltaFiles = 100000;
for (auto& f : fileSets) {
maxDeltaFiles = std::min(maxDeltaFiles, f.deltaFiles.size());
}
for (bool chunk : chunkModes) {
for (bool encrypt : encryptionModes) {
@ -3025,7 +3184,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3038,6 +3197,10 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
double totalElapsed = 0.0;
double totalElapsedClearAll = 0.0;
double totalElapsedSingleKey = 0.0;
std::vector<std::pair<int64_t, double>> varyingDeltas;
for (int i = 0; i <= maxDeltaFiles; i++) {
varyingDeltas.push_back({ 0, 0.0 });
}
for (auto& fileSet : fileSets) {
FileSet newFileSet;
if (!chunk) {
@ -3046,24 +3209,38 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
newFileSet = rewriteChunkedFileSet(fileSet, keys, compressionFilter);
}
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, compressionFilter);
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, newFileSet.deltaFiles.size());
totalBytesRead += res.first;
totalElapsed += res.second;
if (doEdgeCaseReadTests) {
totalElapsedClearAll +=
doReadBench(newFileSet, chunk, fileSet.range, true, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, fileSet.range, true, keys, newFileSet.deltaFiles.size())
.second;
Key k = std::get<3>(fileSet.snapshotFile).front().key;
KeyRange singleKeyRange(KeyRangeRef(k, keyAfter(k)));
totalElapsedSingleKey +=
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, newFileSet.deltaFiles.size())
.second;
}
if (doVaryingDeltaTests && chunk) {
for (int i = 0; i <= maxDeltaFiles; i++) {
auto r = doReadBench(newFileSet, chunk, fileSet.range, false, keys, i);
varyingDeltas[i].first += r.first;
varyingDeltas[i].second += r.second;
}
}
}
readMetrics.push_back({ totalBytesRead, totalElapsed });
if (doEdgeCaseReadTests) {
clearAllReadMetrics.push_back(totalElapsedClearAll);
readSingleKeyMetrics.push_back(totalElapsedSingleKey);
}
if (doVaryingDeltaTests) {
varyingDeltaMetrics.push_back(varyingDeltas);
}
}
}
}
@ -3097,6 +3274,25 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
}
}
if (doVaryingDeltaTests) {
ASSERT(readRunNames.size() == varyingDeltaMetrics.size());
fmt::print("\n\nVarying Deltas Read Results:\nDF#\t");
for (int i = 0; i <= maxDeltaFiles; i++) {
fmt::print("{0}\t", i);
}
fmt::print("\n");
for (int i = 0; i < readRunNames.size(); i++) {
fmt::print("{0}", readRunNames[i]);
for (auto& it : varyingDeltaMetrics[i]) {
double MBperCPUsec = (it.first / 1024.0 / 1024.0) / it.second;
fmt::print("\t{:.6}", MBperCPUsec);
}
fmt::print("\n");
}
}
fmt::print("\n\nCombined Results:\n");
ASSERT(readRunNames.size() == runNames.size() - 1);
for (int i = 0; i < readRunNames.size(); i++) {
@ -3113,3 +3309,22 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
return Void();
}
TEST_CASE("!/blobgranule/files/repeatFromFiles") {
std::string basePath = "SET_ME";
std::vector<std::vector<std::string>> fileSetNames = { { "SET_ME" } };
int64_t totalBytesRead = 0;
double totalElapsed = 0.0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it, true);
auto res = doReadBench(fileSet, true, fileSet.range, false, {}, fileSet.deltaFiles.size(), true);
totalBytesRead += res.first;
totalElapsed += res.second;
}
double MBperCPUsec = (totalBytesRead / 1024.0 / 1024.0) / totalElapsed;
fmt::print("Read Results: {:.6} MB/cpusec\n", MBperCPUsec);
return Void();
}

View File

@ -105,7 +105,9 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
arena.dependsOn(data.arena());
}
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData);
// TODO do something useful with stats?
GranuleMaterializeStats stats;
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData, stats);
} catch (Error& e) {
throw e;

View File

@ -1040,13 +1040,10 @@ private:
Key lastValue;
};
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Database> cx) {
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
// Read begin key, if this fails then block was invalid.
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
// Read kv pairs and end key
@ -1075,7 +1072,6 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1083,7 +1079,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
int len,
Optional<Database> cx) {
state Standalone<StringRef> buf = makeString(len);
int rLen = wait(file->read(mutateString(buf), len, offset));
int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset))));
if (rLen != len)
throw restore_bad_read();
@ -1098,7 +1094,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
wait(decodeKVPairs(&reader, &results, false, cx));
decodeKVPairs(&reader, &results);
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1121,7 +1117,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
wait(decodeKVPairs(&reader, &results, true, cx));
decodeKVPairs(&reader, &results);
} else {
throw restore_unsupported_file_version();
}
@ -1704,7 +1700,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
state std::unique_ptr<IRangeFileWriter> rangeFile;
state BackupConfig backup(task);
state Arena arena;
state Reference<TenantEntryCache<Void>> tenantCache = makeReference<TenantEntryCache<Void>>(cx);
state Reference<TenantEntryCache<Void>> tenantCache;
// Don't need to check keepRunning(task) here because we will do that while finishing each output file, but
// if bc is false then clearly the backup is no longer in progress
@ -1798,6 +1794,10 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
// Initialize range file writer and write begin key
if (encryptionEnabled) {
CODE_PROBE(true, "using encrypted snapshot file writer");
if (!tenantCache.isValid()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache->init());
}
rangeFile = std::make_unique<EncryptedRangeFileWriter>(cx, &arena, tenantCache, outFile, blockSize);
} else {
rangeFile = std::make_unique<RangeFileWriter>(outFile, blockSize);
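The hunk above swaps eager construction of the tenant cache for lazy construction on the first encrypted block, so unencrypted backups never pay for it. A minimal sketch of that pattern in plain C++ (shared_ptr stands in for Reference, and the types are illustrative):

#include <memory>

struct TenantEntryCache {
    void init() {} // one-time, possibly expensive, setup
};

std::shared_ptr<TenantEntryCache> tenantCache; // empty until first needed

void writeEncryptedBlock() {
    if (!tenantCache) { // first encrypted block on this task
        tenantCache = std::make_shared<TenantEntryCache>();
        tenantCache->init();
    }
    // ... use tenantCache ...
}

int main() {
    writeEncryptedBlock(); // constructs and initializes the cache
    writeEncryptedBlock(); // reuses it
}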

View File

@ -122,6 +122,7 @@ IdempotencyIdRef generate(Arena& arena) {
TEST_CASE("/fdbclient/IdempotencyId/basic") {
Arena arena;
uint16_t firstBatchIndex = deterministicRandom()->randomUInt32();
firstBatchIndex &= 0xff7f; // ensure firstBatchIndex+5 won't change the higher order byte
uint16_t batchIndex = firstBatchIndex;
Version commitVersion = deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max());
std::vector<IdempotencyIdRef> idVector; // Reference
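The new mask guarantees that bumping the batch index by a small amount can never carry into its high-order byte. A short exhaustive check of that arithmetic (the offset of 5 comes from the test's comment; the rest is illustrative):

#include <cassert>
#include <cstdint>

int main() {
    for (uint32_t r = 0; r <= 0xffff; r++) {
        uint16_t idx = static_cast<uint16_t>(r) & 0xff7f; // clear bit 7 of the low byte
        // The low byte is now at most 0x7f, so adding 5 cannot carry:
        // the high-order byte of idx + 5 matches that of idx.
        assert(((idx + 5) & 0xff00) == (idx & 0xff00));
    }
    return 0;
}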

View File

@ -40,6 +40,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbrpc/TenantInfo.h"
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/AnnotateActor.h"
@ -66,6 +67,7 @@
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantSpecialKeys.actor.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbclient/versions.h"
@ -687,25 +689,8 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
.detail("MedianBytesPerCommit", cx->bytesPerCommit.median())
.detail("MaxBytesPerCommit", cx->bytesPerCommit.max())
.detail("NumLocalityCacheEntries", cx->locationCache.size());
if (cx->anyBlobGranuleRequests) {
ev.detail("MeanBGLatency", cx->bgLatencies.mean())
.detail("MedianBGLatency", cx->bgLatencies.median())
.detail("MaxBGLatency", cx->bgLatencies.max())
.detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
.detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
.detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
}
}
cx->latencies.clear();
cx->readLatencies.clear();
cx->GRVLatencies.clear();
cx->commitLatencies.clear();
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
cx->bgLatencies.clear();
cx->bgGranulesPerRequest.clear();
if (cx->usedAnyChangeFeeds && logTraces) {
TraceEvent feedEv("ChangeFeedClientMetrics", cx->dbId);
@ -719,6 +704,37 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
cx->ccFeed.logToTraceEvent(feedEv);
}
if (cx->anyBGReads && logTraces) {
TraceEvent bgReadEv("BlobGranuleReadMetrics", cx->dbId);
bgReadEv.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
.detail("Cluster",
cx->getConnectionRecord()
? cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString()
: "")
.detail("Internal", cx->internal);
// add counters
cx->ccBG.logToTraceEvent(bgReadEv);
// add latencies
bgReadEv.detail("MeanBGLatency", cx->bgLatencies.mean())
.detail("MedianBGLatency", cx->bgLatencies.median())
.detail("MaxBGLatency", cx->bgLatencies.max())
.detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
.detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
.detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
}
cx->latencies.clear();
cx->readLatencies.clear();
cx->GRVLatencies.clear();
cx->commitLatencies.clear();
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
cx->bgLatencies.clear();
cx->bgGranulesPerRequest.clear();
lastLogged = now();
}
}
@ -1524,17 +1540,21 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc),
bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"),
feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed),
feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed),
feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000),
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0), cachedReadVersion(0),
lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0),
transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor),
coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0),
detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), anyBGReads(false),
ccBG("BlobGranuleReadMetrics"), bgReadInputBytes("BGReadInputBytes", ccBG),
bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG),
bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG),
bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(1000), bgGranulesPerRequest(1000),
usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), feedStreamStarts("FeedStreamStarts", ccFeed),
feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed),
feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), feedPops("FeedPops", ccFeed),
feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), outstandingWatches(0), sharedStatePtr(nullptr),
lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0),
lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo),
clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0),
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
specialKeySpace(std::make_unique<SpecialKeySpace>(specialKeys.begin, specialKeys.end, /* test */ false)),
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {
@ -1824,14 +1844,17 @@ DatabaseContext::DatabaseContext(const Error& err)
transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc),
bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"),
feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed),
feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed),
feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000),
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
bgGranulesPerRequest(1000), sharedStatePtr(nullptr), transactionTracingSample(false),
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), anyBGReads(false),
ccBG("BlobGranuleReadMetrics"), bgReadInputBytes("BGReadInputBytes", ccBG),
bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG),
bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG),
bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(1000), bgGranulesPerRequest(1000),
usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), feedStreamStarts("FeedStreamStarts", ccFeed),
feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed),
feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), feedPops("FeedPops", ccFeed),
feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), sharedStatePtr(nullptr),
transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {}
// Static constructor used by server processes to create a DatabaseContext
@ -6224,7 +6247,7 @@ ACTOR Future<Optional<ClientTrCommitCostEstimation>> estimateCommitCosts(Referen
trCommitCosts.opsCount++;
keyRange = KeyRangeRef(it->param1, it->param2);
if (trState->options.expensiveClearCostEstimation) {
StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY));
StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY, trState));
trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes));
trCommitCosts.writeCosts += getWriteOperationCost(m.bytes);
++trCommitCosts.expensiveCostEstCount;
@ -7497,34 +7520,45 @@ Future<Void> Transaction::onError(Error const& e) {
return e;
}
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRange keys);
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
KeyRange keys,
Optional<Reference<TransactionState>> trState);
ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx, KeyRange keys, Reference<LocationInfo> locationInfo) {
loop {
try {
WaitMetricsRequest req(keys, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
StorageMetrics m = wait(loadBalance(
locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(Key(), keys);
StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys));
return m;
ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
KeyRange keys,
Reference<LocationInfo> locationInfo,
TenantMapEntry tenantEntry,
Optional<Reference<TransactionState>> trState) {
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
try {
WaitMetricsRequest req(tenantInfo, keys, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
StorageMetrics m = wait(loadBalance(
locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(tenantEntry.prefix, keys);
StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys, trState));
return m;
}
}
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRange keys) {
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
KeyRange keys,
Optional<Reference<TransactionState>> trState) {
state Span span("NAPI:GetStorageMetricsLargeKeyRange"_loc);
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
std::vector<KeyRangeLocationInfo> locations = wait(getKeyRangeLocations(cx,
TenantInfo(),
tenantInfo,
keys,
std::numeric_limits<int>::max(),
Reverse::False,
@ -7540,7 +7574,8 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRang
for (int i = 0; i < nLocs; i++) {
partBegin = (i == 0) ? keys.begin : locations[i].range.begin;
partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end;
fx[i] = doGetStorageMetrics(cx, KeyRangeRef(partBegin, partEnd), locations[i].locations);
fx[i] = doGetStorageMetrics(
cx, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
}
wait(waitForAll(fx));
for (int i = 0; i < nLocs; i++) {
@ -7549,14 +7584,15 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRang
return total;
}
ACTOR Future<Void> trackBoundedStorageMetrics(KeyRange keys,
ACTOR Future<Void> trackBoundedStorageMetrics(TenantInfo tenantInfo,
KeyRange keys,
Reference<LocationInfo> location,
StorageMetrics x,
StorageMetrics halfError,
PromiseStream<StorageMetrics> deltaStream) {
try {
loop {
WaitMetricsRequest req(keys, x - halfError, x + halfError);
WaitMetricsRequest req(tenantInfo, keys, x - halfError, x + halfError);
StorageMetrics nextX = wait(loadBalance(location->locations(), &StorageServerInterface::waitMetrics, req));
deltaStream.send(nextX - x);
x = nextX;
@ -7567,7 +7603,8 @@ ACTOR Future<Void> trackBoundedStorageMetrics(KeyRange keys,
}
}
ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<KeyRangeLocationInfo> locations,
ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(TenantInfo tenantInfo,
std::vector<KeyRangeLocationInfo> locations,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError) {
@ -7581,7 +7618,7 @@ ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<Key
state StorageMetrics minMinus = min - halfErrorPerMachine * (nLocs - 1);
for (int i = 0; i < nLocs; i++) {
WaitMetricsRequest req(locations[i].range, StorageMetrics(), StorageMetrics());
WaitMetricsRequest req(tenantInfo, locations[i].range, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
fx[i] = loadBalance(locations[i].locations->locations(),
@ -7602,7 +7639,7 @@ ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<Key
for (int i = 0; i < nLocs; i++)
wx[i] = trackBoundedStorageMetrics(
locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas);
tenantInfo, locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas);
loop {
StorageMetrics delta = waitNext(deltas.getFuture());
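waitStorageMetricsMultipleLocations divides the permitted error across shards as halfErrorPerMachine so the summed per-shard drift stays inside the overall budget. A small numeric sanity check of that split (the concrete numbers are invented for illustration):

#include <cassert>

int main() {
    const double permittedError = 100.0; // overall allowed drift, e.g. bytes
    const int nLocs = 5; // shards the key range spans
    const double halfErrorPerMachine = 0.5 * permittedError / nLocs;
    // Even if every shard drifts by its full per-machine allowance, the
    // total stays within half the overall budget.
    assert(nLocs * halfErrorPerMachine <= 0.5 * permittedError + 1e-9);
    return 0;
}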
@ -7687,25 +7724,30 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
}
}
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Database cx,
KeyRange keys,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError,
int shardLimit,
int expectedShardCount) {
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
Database cx,
KeyRange keys,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError,
int shardLimit,
int expectedShardCount,
Optional<Reference<TransactionState>> trState) {
state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample));
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
loop {
std::vector<KeyRangeLocationInfo> locations = wait(getKeyRangeLocations(cx,
TenantInfo(),
keys,
shardLimit,
Reverse::False,
&StorageServerInterface::waitMetrics,
span.context,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
state std::vector<KeyRangeLocationInfo> locations =
wait(getKeyRangeLocations(cx,
tenantInfo,
keys,
shardLimit,
Reverse::False,
&StorageServerInterface::waitMetrics,
span.context,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
if (expectedShardCount >= 0 && locations.size() != expectedShardCount) {
return std::make_pair(Optional<StorageMetrics>(), locations.size());
}
@ -7716,9 +7758,9 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(locations, min, max, permittedError);
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(keys, min, max);
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
@ -7731,7 +7773,7 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(Key(), keys);
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
} else {
@ -7741,7 +7783,7 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
.detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY);
wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
// make sure that the next getKeyRangeLocations() call will actually re-fetch the range
cx->invalidateCache(Key(), keys);
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
}
}
}
@ -7752,17 +7794,21 @@ Future<std::pair<Optional<StorageMetrics>, int>> DatabaseContext::waitStorageMet
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount) {
int expectedShardCount,
Optional<Reference<TransactionState>> trState) {
return ::waitStorageMetrics(Database(Reference<DatabaseContext>::addRef(this)),
keys,
min,
max,
permittedError,
shardLimit,
expectedShardCount);
expectedShardCount,
trState);
}
Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys, int shardLimit) {
Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys,
int shardLimit,
Optional<Reference<TransactionState>> trState) {
if (shardLimit > 0) {
StorageMetrics m;
m.bytes = -1;
@ -7772,9 +7818,10 @@ Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys,
m,
StorageMetrics(),
shardLimit,
-1));
-1,
trState));
} else {
return ::getStorageMetricsLargeKeyRange(Database(Reference<DatabaseContext>::addRef(this)), keys);
return ::getStorageMetricsLargeKeyRange(Database(Reference<DatabaseContext>::addRef(this)), keys, trState);
}
}
@ -8062,8 +8109,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
}
if (blobGranuleMapping.more) {
if (BG_REQUEST_DEBUG) {
fmt::print(
"BG Mapping for [{0} - %{1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable());
fmt::print("BG Mapping for [{0} - {1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable());
}
TraceEvent(SevWarn, "BGMappingTooLarge")
.detail("Range", range)
@ -8276,7 +8322,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
}
}
self->trState->cx->anyBlobGranuleRequests = true;
self->trState->cx->anyBGReads = true;
self->trState->cx->bgGranulesPerRequest.addSample(results.size());
self->trState->cx->bgLatencies.addSample(now() - startTime);
@ -8318,8 +8364,13 @@ Transaction::summarizeBlobGranules(const KeyRange& range, Optional<Version> summ
}
void Transaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) {
trState->cx->anyBGReads = true;
trState->cx->bgReadInputBytes += stats.inputBytes;
trState->cx->bgReadOutputBytes += stats.outputBytes;
trState->cx->bgReadSnapshotRows += stats.snapshotRows;
trState->cx->bgReadRowsCleared += stats.rowsCleared;
trState->cx->bgReadRowsInserted += stats.rowsInserted;
trState->cx->bgReadRowsUpdated += stats.rowsUpdated;
}
ACTOR Future<Version> setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) {

View File

@ -1770,7 +1770,10 @@ Future<int64_t> ReadYourWritesTransaction::getEstimatedRangeSizeBytes(const KeyR
if (resetPromise.isSet())
return resetPromise.getFuture().getError();
return map(waitOrError(tr.getDatabase()->getStorageMetrics(keys, -1), resetPromise.getFuture()),
// Pass in the TransactionState only if tenant is present
Optional<Reference<TransactionState>> trState =
tr.trState->hasTenant() ? tr.trState : Optional<Reference<TransactionState>>();
return map(waitOrError(tr.getDatabase()->getStorageMetrics(keys, -1, trState), resetPromise.getFuture()),
[](const StorageMetrics& m) { return m.bytes; });
}

View File

@ -582,7 +582,8 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema(
"duplicate_mutation_fetch_timeout",
"primary_dc_missing",
"fetch_primary_dc_timeout",
"fetch_storage_wiggler_stats_timeout"
"fetch_storage_wiggler_stats_timeout",
"fetch_consistency_scan_info_timeout"
]
},
"issues":[

View File

@ -39,11 +39,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENABLE_VERSION_VECTOR, false );
init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false );
bool buggifyShortReadWindow = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR;
init( MAX_READ_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND; else if (buggifyShortReadWindow) MAX_READ_TRANSACTION_LIFE_VERSIONS = std::max<int>(1, 0.1 * VERSIONS_PER_SECOND); else if( randomize && BUGGIFY ) MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND;
init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_WRITE_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_WRITE_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
init( MAX_VERSION_RATE_MODIFIER, 0.1 );
init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps.
init( ENABLE_VERSION_VECTOR_HA_OPTIMIZATION, false );
@ -296,7 +297,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false );
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -420,6 +421,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experimental purposes, never enable this in production.
// If enabled, all the committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These knobs have contrary functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
// Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable
@ -787,7 +792,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );

View File

@ -145,13 +145,13 @@ Value ThrottleApi::TagQuotaValue::toValue() const {
ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) {
auto tuple = Tuple::unpack(value);
if (tuple.size() != 4) {
if (tuple.size() != 2) {
throw invalid_throttle_quota_value();
}
TagQuotaValue result;
try {
result.reservedQuota = tuple.getDouble(0);
result.totalQuota = tuple.getDouble(1);
result.reservedQuota = tuple.getInt(0);
result.totalQuota = tuple.getInt(1);
} catch (Error& e) {
TraceEvent(SevWarnAlways, "TagQuotaValueFailedToDeserialize").error(e);
throw invalid_throttle_quota_value();
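The fix above tightens the decoder to the actual on-disk shape: exactly two integer fields, reserved quota then total quota. A standalone sketch of that validation (a plain vector stands in for the FDB Tuple codec; names are illustrative):

#include <cstdint>
#include <stdexcept>
#include <vector>

struct TagQuotaValue {
    int64_t reservedQuota = 0;
    int64_t totalQuota = 0;
};

TagQuotaValue fromFields(const std::vector<int64_t>& fields) {
    if (fields.size() != 2) // previously checked against 4, rejecting valid values
        throw std::runtime_error("invalid_throttle_quota_value");
    TagQuotaValue v;
    v.reservedQuota = fields[0];
    v.totalQuota = fields[1];
    return v;
}

int main() {
    TagQuotaValue q = fromFields({ 16384, 32768 }); // reserved, then total
    (void)q;
    return 0;
}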

View File

@ -56,10 +56,18 @@ struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
};
struct GranuleMaterializeStats {
// file-level stats
int64_t inputBytes;
int64_t outputBytes;
GranuleMaterializeStats() : inputBytes(0), outputBytes(0) {}
// merge stats
int32_t snapshotRows;
int32_t rowsCleared;
int32_t rowsInserted;
int32_t rowsUpdated;
GranuleMaterializeStats()
: inputBytes(0), outputBytes(0), snapshotRows(0), rowsCleared(0), rowsInserted(0), rowsUpdated(0) {}
};
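A minimal usage sketch of the widened struct: default-construct one per read, let materialization fill it, then fold it into client-wide counters the way Transaction::addGranuleMaterializeStats does (ClientCounters is an illustrative stand-in for the DatabaseContext counters):

#include <cstdint>

struct GranuleMaterializeStatsSketch {
    int64_t inputBytes = 0, outputBytes = 0;
    int32_t snapshotRows = 0, rowsCleared = 0, rowsInserted = 0, rowsUpdated = 0;
};

struct ClientCounters {
    int64_t bgReadInputBytes = 0, bgReadOutputBytes = 0;
    void add(const GranuleMaterializeStatsSketch& s) {
        bgReadInputBytes += s.inputBytes;
        bgReadOutputBytes += s.outputBytes;
    }
};

int main() {
    GranuleMaterializeStatsSketch s;
    s.inputBytes = 1 << 20; // filled in by materializeBlobGranule in the real code
    s.outputBytes = 256 << 10;
    ClientCounters cc;
    cc.add(s); // input/output here gives a 4x materialization amplification
    return 0;
}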
struct BlobGranuleCipherKeysMeta {

View File

@ -51,7 +51,8 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]);
StringRef deltaFileData[],
GranuleMaterializeStats& stats);
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);

View File

@ -62,8 +62,8 @@ struct BlobMetadataDetailsRef {
BlobMetadataDomainNameRef domainName,
Optional<StringRef> base,
VectorRef<StringRef> partitions,
int64_t refreshAt,
int64_t expireAt)
double refreshAt,
double expireAt)
: domainId(domainId), domainName(ar, domainName), partitions(ar, partitions), refreshAt(refreshAt),
expireAt(expireAt) {
if (base.present()) {

View File

@ -298,13 +298,19 @@ public:
Future<Void> onProxiesChanged() const;
Future<HealthMetrics> getHealthMetrics(bool detailed);
// Pass a negative value for `shardLimit` to indicate no limit on the shard number.
Future<StorageMetrics> getStorageMetrics(KeyRange const& keys, int shardLimit);
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount);
// Pass a valid `trState` with `hasTenant() == true` to make the function tenant-aware.
Future<StorageMetrics> getStorageMetrics(
KeyRange const& keys,
int shardLimit,
Optional<Reference<TransactionState>> trState = Optional<Reference<TransactionState>>());
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount,
Optional<Reference<TransactionState>> trState = Optional<Reference<TransactionState>>());
Future<Void> splitStorageMetricsStream(PromiseStream<Key> const& resultsStream,
KeyRange const& keys,
StorageMetrics const& limit,
@ -548,8 +554,17 @@ public:
Counter transactionGrvFullBatches;
Counter transactionGrvTimedOutBatches;
Counter transactionCommitVersionNotFoundForSS;
// Blob Granule Read metrics. Omit from logging if not used.
bool anyBGReads;
CounterCollection ccBG;
Counter bgReadInputBytes;
Counter bgReadOutputBytes;
Counter bgReadSnapshotRows;
Counter bgReadRowsCleared;
Counter bgReadRowsInserted;
Counter bgReadRowsUpdated;
ContinuousSample<double> bgLatencies, bgGranulesPerRequest;
// Change Feed metrics. Omit change feed metrics from logging if not used
bool usedAnyChangeFeeds;
@ -562,7 +577,7 @@ public:
Counter feedPopsFallback;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit,
bytesPerCommit, bgLatencies, bgGranulesPerRequest;
bytesPerCommit;
int outstandingWatches;
int maxOutstandingWatches;
@ -591,7 +606,6 @@ public:
bool transactionTracingSample;
double verifyCausalReadsProp = 0.0;
bool blobGranuleNoMaterialize = false;
bool anyBlobGranuleRequests = false;
Future<Void> logger;
Future<Void> throttleExpirer;

View File

@ -1402,6 +1402,25 @@ struct TenantMode {
serializer(ar, mode);
}
// This does not round-trip cleanly with toString:
// the '_experimental' suffix, if present, must be removed before the mode can be parsed.
static TenantMode fromString(std::string mode) {
if (mode.find("_experimental") != std::string::npos) {
mode.replace(mode.find("_experimental"), std::string::npos, "");
}
if (mode == "disabled") {
return TenantMode::DISABLED;
} else if (mode == "optional") {
return TenantMode::OPTIONAL_TENANT;
} else if (mode == "required") {
return TenantMode::REQUIRED;
} else {
TraceEvent(SevError, "UnknownTenantMode").detail("TenantMode", mode);
ASSERT(false);
throw internal_error();
}
}
std::string toString() const {
switch (mode) {
case DISABLED:
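A self-contained sketch of the suffix handling fromString performs before matching the base mode name (stripExperimental is an illustrative helper, not FDB API):

#include <cassert>
#include <string>

std::string stripExperimental(std::string mode) {
    auto pos = mode.find("_experimental");
    if (pos != std::string::npos)
        mode.erase(pos); // "optional_experimental" -> "optional"
    return mode;
}

int main() {
    assert(stripExperimental("optional_experimental") == "optional");
    assert(stripExperimental("required") == "required");
    return 0;
}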
@ -1686,10 +1705,20 @@ struct Versionstamp {
serializer(ar, beVersion, beBatch);
if constexpr (Ar::isDeserializing) {
version = bigEndian64(version);
version = bigEndian64(beVersion);
batchNumber = bigEndian16(beBatch);
}
}
};
template <class Ar>
inline void save(Ar& ar, const Versionstamp& value) {
return const_cast<Versionstamp&>(value).serialize(ar);
}
template <class Ar>
inline void load(Ar& ar, Versionstamp& value) {
value.serialize(ar);
}
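The serializer fix converts from the freshly read wire field beVersion instead of the still-unset host-order member. A standalone round trip showing the intended byte-order handling (__builtin_bswap64 assumes GCC/Clang on a little-endian host; all names illustrative):

#include <cassert>
#include <cstdint>

uint64_t bigEndian64(uint64_t v) {
    return __builtin_bswap64(v); // host <-> big-endian on a little-endian machine
}

struct VersionstampSketch {
    uint64_t version = 0;
    uint16_t batchNumber = 0;
};

int main() {
    VersionstampSketch in{ 0x0123456789abcdefULL, 7 };
    // "Serialize": write the big-endian wire field.
    uint64_t beVersion = bigEndian64(in.version);
    // "Deserialize": convert from beVersion, not from the member itself.
    VersionstampSketch out;
    out.version = bigEndian64(beVersion);
    out.batchNumber = in.batchNumber;
    assert(out.version == in.version);
    return 0;
}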
#endif

View File

@ -104,6 +104,11 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getL
// Collect cached cipher keys.
for (auto& domain : domains) {
if (domain.first == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (domain.first == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
Reference<BlobCipherKey> cachedCipherKey = cipherKeyCache->getLatestCipherKey(domain.first /*domainId*/);
if (cachedCipherKey.isValid()) {
cipherKeys[domain.first] = cachedCipherKey;
@ -301,7 +306,7 @@ template <class T>
Future<TextAndHeaderCipherKeys> getLatestSystemEncryptCipherKeys(const Reference<AsyncVar<T> const>& db,
BlobCipherMetrics::UsageType usageType) {
return getLatestEncryptCipherKeysForDomain(
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME, usageType);
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME, usageType);
}
ACTOR template <class T>

View File

@ -319,6 +319,11 @@ public:
tr->clear(key);
}
template <class Transaction>
Future<Void> watch(Transaction tr) {
return tr->watch(key);
}
Key key;
};
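The watch() helper added above follows FDB's standard watch pattern: register the watch inside a transaction, commit, then wait on the returned future. A hedged sketch of a typical driver loop (the same shape appears in TenantEntryCache later in this diff):

ACTOR Future<Void> waitForTenantMetadataChange(Database db) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
	loop {
		try {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			state Future<Void> change = TenantMetadata::lastTenantModification().watch(tr);
			wait(tr->commit()); // the watch is registered when the transaction commits
			wait(change); // fires once the watched key is modified
			return Void();
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}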

View File

@ -502,6 +502,7 @@ Future<Void> decommissionMetacluster(Reference<DB> db) {
ManagementClusterMetadata::tenantMetadata().lastTenantId.clear(tr);
ManagementClusterMetadata::tenantMetadata().tenantTombstones.clear(tr);
ManagementClusterMetadata::tenantMetadata().tombstoneCleanupData.clear(tr);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.clear(tr);
wait(managementClusterCheckEmpty(tr));
MetaclusterMetadata::metaclusterRegistration().clear(tr);
@ -797,6 +798,7 @@ struct RemoveClusterImpl {
ASSERT(entry.getString(0) == self->ctx.clusterName.get());
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, entry.getString(1));
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, entry.getInt(2));
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
// Erase all of the tenants processed in this transaction from the cluster tenant index
@ -1262,6 +1264,7 @@ struct CreateTenantImpl {
self->tenantEntry.tenantState = TenantState::REGISTERING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->tenantEntry);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantEntry.id, self->tenantName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue);
ManagementClusterMetadata::clusterTenantCount.atomicOp(
@ -1317,6 +1320,7 @@ struct CreateTenantImpl {
TenantMapEntry updatedEntry = managementEntry.get();
updatedEntry.tenantState = TenantState::READY;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
return Void();
@ -1446,6 +1450,7 @@ struct DeleteTenantImpl {
}
updatedEntry.tenantState = TenantState::REMOVING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// If this has a rename pair, also mark the other entry for deletion
if (self->pairName.present()) {
state Optional<TenantMapEntry> pairEntry = wait(tryGetTenantTransaction(tr, self->pairName.get()));
@ -1457,6 +1462,8 @@ struct DeleteTenantImpl {
CODE_PROBE(true, "marking pair tenant in removing state");
updatedPairEntry.tenantState = TenantState::REMOVING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->pairName.get(), updatedPairEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(
tr, Versionstamp(), 0);
}
}
@ -1485,6 +1492,7 @@ struct DeleteTenantImpl {
// Erase the tenant entry itself
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantName);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, tenantEntry.get().id);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// This is idempotent because this function is only called if the tenant is in the map
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue);
@ -1689,6 +1697,7 @@ struct ConfigureTenantImpl {
++self->updatedEntry.configurationSequenceNum;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
return Void();
}
@ -1724,6 +1733,7 @@ struct ConfigureTenantImpl {
tenantEntry.get().tenantState = TenantState::READY;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get());
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
return Void();
}
@ -1770,6 +1780,7 @@ struct RenameTenantImpl {
TenantMapEntry tenantEntry) {
// Erase the tenant entry itself
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->oldName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// Remove old tenant from tenant count
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue);
@ -1857,6 +1868,7 @@ struct RenameTenantImpl {
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->oldName, updatedOldEntry);
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// Add temporary tenant to tenantCount to prevent exceeding capacity during a rename
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue);
@ -1919,6 +1931,7 @@ struct RenameTenantImpl {
updatedNewEntry.renamePair.reset();
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantId, self->newName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
// We will remove the old entry from the management cluster
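Each lastTenantModification update above is a versionstamped write: setVersionstamp(tr, Versionstamp(), 0) stores a placeholder that the commit process overwrites with the transaction's commit versionstamp, so a single watch on this key observes every tenant mutation. Conceptually it reduces to the following (hedged sketch; the exact encoding is internal to KeyBackedBinaryValue, and the key literal is a stand-in):

// value = 10 placeholder bytes (replaced at commit time with the commit versionstamp)
//         + a 4-byte little-endian offset of the placeholder within the value (0 here).
uint8_t value[14] = {}; // all zeros: zeroed placeholder, offset 0
tr->atomicOp("lastModificationKey"_sr, StringRef(value, sizeof(value)), MutationRef::SetVersionstampedValue);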

View File

@ -237,6 +237,8 @@ public:
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED;
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_REFRESH_INTERVAL; // How often the storage usage (in bytes) of each tenant in the
// TenantCache is refreshed
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -343,6 +345,8 @@ public:
int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD;
int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD;
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;

View File

@ -103,7 +103,7 @@ struct StorageServerInterface {
PublicRequestStream<struct GetMappedKeyValuesRequest> getMappedKeyValues;
RequestStream<struct GetShardStateRequest> getShardState;
RequestStream<struct WaitMetricsRequest> waitMetrics;
PublicRequestStream<struct WaitMetricsRequest> waitMetrics;
RequestStream<struct SplitMetricsRequest> splitMetrics;
RequestStream<struct GetStorageMetricsRequest> getStorageMetrics;
RequestStream<ReplyPromise<Void>> waitFailure;
@ -161,7 +161,8 @@ public:
PublicRequestStream<struct GetKeyValuesRequest>(getValue.getEndpoint().getAdjustedEndpoint(2));
getShardState =
RequestStream<struct GetShardStateRequest>(getValue.getEndpoint().getAdjustedEndpoint(3));
waitMetrics = RequestStream<struct WaitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(4));
waitMetrics =
PublicRequestStream<struct WaitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(4));
splitMetrics = RequestStream<struct SplitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(5));
getStorageMetrics =
RequestStream<struct GetStorageMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(6));
@ -713,18 +714,25 @@ struct WaitMetricsRequest {
// Waits for any of the given minimum or maximum metrics to be exceeded, and then returns the current values
// Send a reversed range for min, max to receive an immediate report
constexpr static FileIdentifier file_identifier = 1795961;
// Setting the tenantInfo makes the request tenant-aware.
Optional<TenantInfo> tenantInfo;
Arena arena;
KeyRangeRef keys;
StorageMetrics min, max;
ReplyPromise<StorageMetrics> reply;
bool verify() const { return tenantInfo.present() && tenantInfo.get().isAuthorized(); }
WaitMetricsRequest() {}
WaitMetricsRequest(KeyRangeRef const& keys, StorageMetrics const& min, StorageMetrics const& max)
: keys(arena, keys), min(min), max(max) {}
WaitMetricsRequest(TenantInfo tenantInfo,
KeyRangeRef const& keys,
StorageMetrics const& min,
StorageMetrics const& max)
: tenantInfo(tenantInfo), keys(arena, keys), min(min), max(max) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keys, min, max, reply, arena);
serializer(ar, keys, min, max, reply, tenantInfo, arena);
}
};
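With waitMetrics promoted to a PublicRequestStream, the server rejects any request whose verify() fails, so tenant-aware callers are expected to populate tenantInfo via the new constructor. A hedged usage sketch from inside an actor (getTenantInfo() on the transaction state is an assumption here):

WaitMetricsRequest req(trState->getTenantInfo(), keys, minMetrics, maxMetrics);
ASSERT(req.verify()); // only authorized tenant requests pass the PublicRequestStream check
StorageMetrics metrics = wait(ssi.waitMetrics.getReply(req));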

View File

@ -597,8 +597,8 @@ Future<Void> enableAuto(Reference<DB> db, bool enabled) {
class TagQuotaValue {
public:
double reservedQuota{ 0.0 };
double totalQuota{ 0.0 };
int64_t reservedQuota{ 0 };
int64_t totalQuota{ 0 };
bool isValid() const;
Value toValue() const;
static TagQuotaValue fromValue(ValueRef);

View File

@ -181,6 +181,7 @@ struct TenantMetadataSpecification {
KeyBackedObjectProperty<TenantTombstoneCleanupData, decltype(IncludeVersion())> tombstoneCleanupData;
KeyBackedSet<Tuple> tenantGroupTenantIndex;
KeyBackedObjectMap<TenantGroupName, TenantGroupEntry, decltype(IncludeVersion()), NullCodec> tenantGroupMap;
KeyBackedBinaryValue<Versionstamp> lastTenantModification;
TenantMetadataSpecification(KeyRef prefix)
: subspace(prefix.withSuffix("tenant/"_sr)), tenantMap(subspace.withSuffix("map/"_sr), IncludeVersion()),
@ -188,7 +189,8 @@ struct TenantMetadataSpecification {
tenantCount(subspace.withSuffix("count"_sr)), tenantTombstones(subspace.withSuffix("tombstones/"_sr)),
tombstoneCleanupData(subspace.withSuffix("tombstoneCleanup"_sr), IncludeVersion()),
tenantGroupTenantIndex(subspace.withSuffix("tenantGroup/tenantIndex/"_sr)),
tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()) {}
tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()),
lastTenantModification(subspace.withSuffix("lastModification"_sr)) {}
};
struct TenantMetadata {
@ -203,6 +205,7 @@ struct TenantMetadata {
static inline auto& tombstoneCleanupData() { return instance().tombstoneCleanupData; }
static inline auto& tenantGroupTenantIndex() { return instance().tenantGroupTenantIndex; }
static inline auto& tenantGroupMap() { return instance().tenantGroupMap; }
static inline auto& lastTenantModification() { return instance().lastTenantModification; }
static Key tenantMapPrivatePrefix();
};

View File

@ -44,8 +44,14 @@
using TenantNameEntryPair = std::pair<TenantName, TenantMapEntry>;
using TenantNameEntryPairVec = std::vector<TenantNameEntryPair>;
enum class TenantEntryCacheRefreshReason { INIT = 1, PERIODIC_TASK = 2, CACHE_MISS = 3, REMOVE_ENTRY = 4 };
enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, NONE = 2 };
enum class TenantEntryCacheRefreshReason {
INIT = 1,
PERIODIC_TASK = 2,
CACHE_MISS = 3,
REMOVE_ENTRY = 4,
WATCH_TRIGGER = 5
};
enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, WATCH = 2, NONE = 3 };
template <class T>
struct TenantEntryCachePayload {
@ -62,12 +68,6 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con
// 1. Lookup by 'TenantId'
// 2. Lookup by 'TenantPrefix'
// 3. Lookup by 'TenantName'
//
// TODO:
// ----
// The cache allows user to construct the 'cached object' by supplying a callback. The cache implements a periodic
// refresh mechanism, polling underlying database for updates (add/remove tenants), in future we might want to implement
// database range-watch to monitor such updates
template <class T>
class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable {
@ -78,6 +78,10 @@ private:
TenantEntryCacheRefreshMode refreshMode;
Future<Void> refresher;
Future<Void> watchRefresher;
Future<Void> lastTenantIdRefresher;
Promise<Void> setInitialWatch;
Optional<int64_t> lastTenantId;
Map<int64_t, TenantEntryCachePayload<T>> mapByTenantId;
Map<TenantName, TenantEntryCachePayload<T>> mapByTenantName;
@ -87,6 +91,7 @@ private:
Counter refreshByCacheInit;
Counter refreshByCacheMiss;
Counter numRefreshes;
Counter refreshByWatchTrigger;
ACTOR static Future<TenantNameEntryPairVec> getTenantList(Reference<ReadYourWritesTransaction> tr) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
@ -102,16 +107,166 @@ private:
return tenantList.results;
}
ACTOR static Future<Void> refreshCacheById(int64_t tenantId,
TenantEntryCache<T>* cache,
TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheIDRefreshStart", cache->id()).detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
state Optional<TenantName> name = wait(TenantMetadata::tenantIdIndex().get(tr, tenantId));
if (name.present()) {
Optional<TenantMapEntry> entry = wait(TenantMetadata::tenantMap().get(tr, name.get()));
if (entry.present()) {
cache->put(std::make_pair(name.get(), entry.get()));
updateCacheRefreshMetrics(cache, reason);
}
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
TraceEvent(SevDebug, "TenantEntryCacheIDRefreshEnd", cache->id()).detail("Reason", static_cast<int>(reason));
return Void();
}
ACTOR static Future<Void> refreshCacheByName(TenantName name,
TenantEntryCache<T>* cache,
TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheNameRefreshStart", cache->id())
.detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<TenantMapEntry> entry = wait(TenantMetadata::tenantMap().get(tr, name));
if (entry.present()) {
cache->put(std::make_pair(name, entry.get()));
updateCacheRefreshMetrics(cache, reason);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
TraceEvent(SevDebug, "TenantEntryCacheNameRefreshEnd", cache->id()).detail("Reason", static_cast<int>(reason));
return Void();
}
static void updateCacheRefreshMetrics(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
if (reason == TenantEntryCacheRefreshReason::INIT) {
cache->refreshByCacheInit += 1;
} else if (reason == TenantEntryCacheRefreshReason::CACHE_MISS) {
cache->refreshByCacheMiss += 1;
} else if (reason == TenantEntryCacheRefreshReason::WATCH_TRIGGER) {
cache->refreshByWatchTrigger += 1;
}
cache->numRefreshes += 1;
}
ACTOR static Future<Void> refreshCacheUsingWatch(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheRefreshUsingWatchStart", cache->id())
.detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Future<Void> tenantModifiedWatch = TenantMetadata::lastTenantModification().watch(tr);
wait(tr->commit());
TraceEvent(SevDebug, "TenantEntryCacheRefreshWatchSet", cache->id());
// setInitialWatch is set to indicate that an initial watch has been set for the lastTenantModification
// key. Currently this is only used in simulation to avoid a race condition where a tenant is created
// before the initial watch is set. However, it can be enabled by passing waitForInitialWatch = true to
// the init() method.
if (cache->setInitialWatch.canBeSet()) {
cache->setInitialWatch.send(Void());
}
wait(tenantModifiedWatch);
// If watch triggered then refresh the cache as tenant metadata was updated
TraceEvent(SevDebug, "TenantEntryCacheRefreshUsingWatchTriggered", cache->id())
.detail("Reason", static_cast<int>(reason));
wait(refreshImpl(cache, reason));
tr->reset();
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent("TenantEntryCacheRefreshUsingWatchError", cache->id())
.errorUnsuppressed(e)
.suppressFor(1.0);
}
wait(tr->onError(e));
// The watch threw an error; refresh the cache in case tenant metadata changed while the watch was down
wait(refreshImpl(cache, reason));
}
}
}
static bool tenantsEnabled(TenantEntryCache<T>* cache) {
// Avoid using the cache if the tenant mode is disabled. However, since we rely on clientInfo, it may not
// be fully up to date (i.e., it may report the tenant mode as disabled when it is in fact required). Thus,
// if at least one tenant has been created on the cluster, use the cache anyway to avoid an incorrect
// miss.
if (cache->getDatabase()->clientInfo->get().tenantMode == TenantMode::DISABLED) {
if (!cache->lastTenantId.present()) {
return false;
}
return cache->lastTenantId.get() > 0;
}
return true;
}
ACTOR static Future<Void> setLastTenantId(TenantEntryCache<T>* cache) {
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<int64_t> lastTenantId = wait(TenantMetadata::lastTenantId().get(tr));
cache->lastTenantId = lastTenantId;
return Void();
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR static Future<Void> lastTenantIdWatch(TenantEntryCache<T>* cache) {
TraceEvent(SevDebug, "TenantEntryCacheLastTenantIdWatchStart", cache->id());
// Monitor for any changes to the last tenant id and update it as necessary
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Future<Void> lastTenantIdWatch = tr->watch(TenantMetadata::lastTenantId().key);
wait(tr->commit());
wait(lastTenantIdWatch);
wait(setLastTenantId(cache));
tr->reset();
} catch (Error& e) {
state Error err(e);
if (err.code() != error_code_actor_cancelled) {
TraceEvent("TenantEntryCacheLastTenantIdWatchError", cache->id())
.errorUnsuppressed(err)
.suppressFor(1.0);
// The watch errored out; refresh lastTenantId in case it changed while the watch was down and we
// missed an update
wait(setLastTenantId(cache));
}
wait(tr->onError(err));
}
}
}
ACTOR static Future<Void> refreshImpl(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheRefreshStart", cache->id()).detail("Reason", static_cast<int>(reason));
@ -130,9 +285,7 @@ private:
break;
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevInfo, "TenantEntryCacheRefreshError", cache->id())
.errorUnsuppressed(e)
.suppressFor(1.0);
TraceEvent("TenantEntryCacheRefreshError", cache->id()).errorUnsuppressed(e).suppressFor(1.0);
}
wait(tr->onError(e));
}
@ -151,12 +304,22 @@ private:
return ret;
}
TraceEvent(SevInfo, "TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId);
if (!tenantsEnabled(cache)) {
// If tenants are disabled on the cluster avoid using the cache
return Optional<TenantEntryCachePayload<T>>();
}
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
// TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any
// existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
TraceEvent("TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId);
if (cache->refreshMode == TenantEntryCacheRefreshMode::WATCH) {
// Entry not found. Do a point refresh
// TODO: Don't initiate refresh if tenantId < maxTenantId (stored as a system key currently) as we know that
// such a tenant does not exist (it has either never existed or has been deleted)
wait(refreshCacheById(tenantId, cache, TenantEntryCacheRefreshReason::CACHE_MISS));
} else {
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
}
cache->misses += 1;
return cache->lookupById(tenantId);
@ -170,12 +333,20 @@ private:
return ret;
}
if (!tenantsEnabled(cache)) {
// If tenants are disabled on the cluster avoid using the cache
return Optional<TenantEntryCachePayload<T>>();
}
TraceEvent("TenantEntryCacheGetByNameRefresh").detail("TenantName", name);
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
// TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any
// existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
if (cache->refreshMode == TenantEntryCacheRefreshMode::WATCH) {
// Entry not found. Do a point refresh
wait(refreshCacheByName(name, cache, TenantEntryCacheRefreshReason::CACHE_MISS));
} else {
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
}
cache->misses += 1;
return cache->lookupByName(name);
@ -272,7 +443,18 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid);
}
TenantEntryCache(Database db, TenantEntryCacheRefreshMode mode)
: uid(deterministicRandom()->randomUniqueID()), db(db), createPayloadFunc(defaultCreatePayload),
refreshMode(mode), metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics),
misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid);
}
@ -282,7 +464,8 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
@ -291,7 +474,8 @@ public:
metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics),
misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
@ -300,26 +484,36 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
Future<Void> init() {
Future<Void> init(bool waitForInitialWatch = false) {
TraceEvent("TenantEntryCacheInit", uid);
Future<Void> f = refreshImpl(this, TenantEntryCacheRefreshReason::INIT);
// Launch a background task to periodically refresh the cache by scanning the database KeyRange
TenantEntryCacheRefreshReason reason = TenantEntryCacheRefreshReason::PERIODIC_TASK;
Future<Void> initialWatchFuture = Void();
lastTenantIdRefresher = lastTenantIdWatch(this);
if (refreshMode == TenantEntryCacheRefreshMode::PERIODIC_TASK) {
refresher = recurringAsync([&, reason]() { return refresh(reason); },
CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* interval */
true, /* absoluteIntervalDelay */
CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* initialDelay */
TaskPriority::Worker);
} else if (refreshMode == TenantEntryCacheRefreshMode::WATCH) {
if (waitForInitialWatch) {
initialWatchFuture = setInitialWatch.getFuture();
}
watchRefresher = refreshCacheUsingWatch(this, TenantEntryCacheRefreshReason::WATCH_TRIGGER);
}
return f;
Future<Void> setLastTenant = setLastTenantId(this);
return f && initialWatchFuture && setLastTenant;
}
Database getDatabase() const { return db; }
@ -341,28 +535,33 @@ public:
}
void put(const TenantNameEntryPair& pair) {
TenantEntryCachePayload<T> payload = createPayloadFunc(pair.first, pair.second);
auto idItr = mapByTenantId.find(pair.second.id);
auto nameItr = mapByTenantName.find(pair.first);
const auto& [name, entry] = pair;
TenantEntryCachePayload<T> payload = createPayloadFunc(name, entry);
auto idItr = mapByTenantId.find(entry.id);
auto nameItr = mapByTenantName.find(name);
Optional<TenantName> existingName;
Optional<int64_t> existingId;
if (nameItr != mapByTenantName.end()) {
existingId = nameItr->value.entry.id;
mapByTenantId.erase(nameItr->value.entry.id);
}
if (idItr != mapByTenantId.end()) {
existingName = idItr->value.name;
mapByTenantName.erase(idItr->value.name);
}
if (existingId.present()) {
mapByTenantId.erase(existingId.get());
}
if (existingName.present()) {
mapByTenantName.erase(existingName.get());
}
mapByTenantId[pair.second.id] = payload;
mapByTenantName[pair.first] = payload;
mapByTenantId[entry.id] = payload;
mapByTenantName[name] = payload;
TraceEvent("TenantEntryCachePut")
.detail("TenantName", pair.first)
.detail("TenantName", name)
.detail("TenantNameExisting", existingName)
.detail("TenantID", pair.second.id)
.detail("TenantID", entry.id)
.detail("TenantIDExisting", existingId)
.detail("TenantPrefix", pair.second.prefix);
@ -384,7 +583,8 @@ public:
Counter::Value numCacheRefreshes() const { return numRefreshes.getValue(); }
Counter::Value numRefreshByMisses() const { return refreshByCacheMiss.getValue(); }
Counter::Value numRefreshByInit() const { return refreshByCacheInit.getValue(); }
Counter::Value numWatchRefreshes() const { return refreshByWatchTrigger.getValue(); }
};
#include "flow/unactorcompiler.h"
#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H
#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H
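Putting the new pieces together, a hedged usage sketch of the watch-based refresh mode from inside an actor (`db` is assumed to be a connected Database; getByName is assumed from the *Impl helpers above; waiting on the initial watch matters mainly in simulation, per the comment in refreshCacheUsingWatch):

state Reference<TenantEntryCache<Void>> cache =
    makeReference<TenantEntryCache<Void>>(db, TenantEntryCacheRefreshMode::WATCH);
wait(cache->init(true /* waitForInitialWatch */));
// A later create/rename/delete bumps lastTenantModification, the watch fires,
// and the cache refreshes itself; point lookups fall back to refreshCacheByName on a miss.
Optional<TenantEntryCachePayload<Void>> entry = wait(cache->getByName("tenant1"_sr));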

View File

@ -178,6 +178,7 @@ Future<std::pair<Optional<TenantMapEntry>, bool>> createTenantTransaction(
TenantMetadata::tenantMap().set(tr, name, tenantEntry);
TenantMetadata::tenantIdIndex().set(tr, tenantEntry.id, name);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
if (tenantEntry.tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex().insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name));
@ -346,6 +347,7 @@ Future<Void> deleteTenantTransaction(Transaction tr,
TenantMetadata::tenantMap().erase(tr, name);
TenantMetadata::tenantIdIndex().erase(tr, tenantEntry.get().id);
TenantMetadata::tenantCount().atomicOp(tr, -1, MutationRef::AddValue);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
if (tenantEntry.get().tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex().erase(tr,
@ -420,6 +422,7 @@ Future<Void> configureTenantTransaction(Transaction tr,
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
TenantMetadata::tenantMap().set(tr, tenantName, updatedTenantEntry);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
// If the tenant group was changed, we need to update the tenant group metadata structures
if (originalEntry.tenantGroup != updatedTenantEntry.tenantGroup) {
@ -523,6 +526,7 @@ Future<Void> renameTenantTransaction(Transaction tr,
TenantMetadata::tenantMap().erase(tr, oldName);
TenantMetadata::tenantMap().set(tr, newName, oldEntry.get());
TenantMetadata::tenantIdIndex().set(tr, oldEntry.get().id, newName);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
// Update the tenant group index to reflect the new tenant name
if (oldEntry.get().tenantGroup.present()) {

View File

@ -202,8 +202,9 @@ description is not currently required but encouraged.
description="Deprecated. Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect."
defaultFor="23"/>
<Option name="transaction_automatic_idempotency" code="506"
description="Set a random idempotency id for all transactions. See the transaction option description for more information."
defaultFor="505"/>
description="Set a random idempotency id for all transactions. See the transaction option description for more information. This feature is in development and not ready for general use."
defaultFor="505"
hidden="true"/>
<Option name="transaction_bypass_unreadable" code="700"
description="Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information."
defaultFor="1100"/>
@ -278,9 +279,11 @@ description is not currently required but encouraged.
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes." />
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use."
hidden="true" />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future." />
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use."
hidden="true" />
<Option name="snapshot_ryw_enable" code="600"
description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." />
<Option name="snapshot_ryw_disable" code="601"

View File

@ -48,15 +48,17 @@ public:
ACTOR static Future<Standalone<StringRef>> readBlock(AsyncFileEncrypted* self, uint32_t block) {
state Arena arena;
state unsigned char* encrypted = new (arena) unsigned char[FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE];
int bytes = wait(
self->file->read(encrypted, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block));
int bytes = wait(uncancellable(holdWhile(arena,
self->file->read(encrypted,
FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE,
FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block))));
StreamCipherKey const* cipherKey = StreamCipherKey::getGlobalCipherKey();
DecryptionStreamCipher decryptor(cipherKey, self->getIV(block));
auto decrypted = decryptor.decrypt(encrypted, bytes, arena);
return Standalone<StringRef>(decrypted, arena);
}
ACTOR static Future<int> read(AsyncFileEncrypted* self, void* data, int length, int64_t offset) {
ACTOR static Future<int> read(Reference<AsyncFileEncrypted> self, void* data, int length, int64_t offset) {
state const uint32_t firstBlock = offset / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE;
state const uint32_t lastBlock = (offset + length - 1) / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE;
state uint32_t block;
@ -70,7 +72,7 @@ public:
if (cachedBlock.present()) {
plaintext = cachedBlock.get();
} else {
wait(store(plaintext, readBlock(self, block)));
wait(store(plaintext, readBlock(self.getPtr(), block)));
self->readBuffers.insert(block, plaintext);
}
auto start = (block == firstBlock) ? plaintext.begin() + (offset % FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE)
@ -96,7 +98,7 @@ public:
return bytesRead;
}
ACTOR static Future<Void> write(AsyncFileEncrypted* self, void const* data, int length, int64_t offset) {
ACTOR static Future<Void> write(Reference<AsyncFileEncrypted> self, void const* data, int length, int64_t offset) {
ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY);
// All writes must append to the end of the file:
ASSERT_EQ(offset, self->currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE + self->offsetInBlock);
@ -122,7 +124,7 @@ public:
return Void();
}
ACTOR static Future<Void> sync(AsyncFileEncrypted* self) {
ACTOR static Future<Void> sync(Reference<AsyncFileEncrypted> self) {
ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY);
wait(self->writeLastBlockToFile());
wait(self->file->sync());
@ -135,7 +137,7 @@ public:
Arena arena;
auto zeroes = new (arena) unsigned char[length];
memset(zeroes, 0, length);
wait(self->write(zeroes, length, offset));
wait(uncancellable(holdWhile(arena, self->write(zeroes, length, offset))));
return Void();
}
};
@ -159,11 +161,11 @@ void AsyncFileEncrypted::delref() {
}
Future<int> AsyncFileEncrypted::read(void* data, int length, int64_t offset) {
return AsyncFileEncryptedImpl::read(this, data, length, offset);
return AsyncFileEncryptedImpl::read(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset);
}
Future<Void> AsyncFileEncrypted::write(void const* data, int length, int64_t offset) {
return AsyncFileEncryptedImpl::write(this, data, length, offset);
return AsyncFileEncryptedImpl::write(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset);
}
Future<Void> AsyncFileEncrypted::zeroRange(int64_t offset, int64_t length) {
@ -177,7 +179,7 @@ Future<Void> AsyncFileEncrypted::truncate(int64_t size) {
Future<Void> AsyncFileEncrypted::sync() {
ASSERT(mode == Mode::APPEND_ONLY);
return AsyncFileEncryptedImpl::sync(this);
return AsyncFileEncryptedImpl::sync(Reference<AsyncFileEncrypted>::addRef(this));
}
Future<Void> AsyncFileEncrypted::flush() {
@ -217,7 +219,11 @@ StreamCipher::IV AsyncFileEncrypted::getIV(uint32_t block) const {
}
Future<Void> AsyncFileEncrypted::writeLastBlockToFile() {
return file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE);
// The source buffer for the write is owned by *this so this must be kept alive by reference count until the write
// is finished.
return uncancellable(
holdWhile(Reference<AsyncFileEncrypted>::addRef(this),
file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE)));
}
size_t AsyncFileEncrypted::RandomCache::evict() {

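The recurring shape of the lifetime fixes in this file, distilled: holdWhile() keeps an owning object alive until a future is ready, and uncancellable() ensures the underlying I/O runs to completion even if the caller is cancelled, so the kernel can never write into freed memory. A hedged sketch:

state Arena arena;
state uint8_t* buf = new (arena) uint8_t[4096];
// Even if this actor is cancelled, the read continues to completion and
// `arena` (and therefore `buf`) stays alive until it finishes.
int n = wait(uncancellable(holdWhile(arena, file->read(buf, 4096, /*offset*/ 0))));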
View File

@ -72,8 +72,9 @@ public:
// Wait for diskDelay before submitting the I/O
// Template types are provided explicitly because they cannot be deduced automatically here.
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<int>(Void)>, int>(
delay(diskDelay), [=](Void _) -> Future<int> { return file->read(data, length, offset); });
delay(diskDelay), [=, file = file](Void _) -> Future<int> { return file->read(data, length, offset); });
}
Future<Void> write(void const* data, int length, int64_t offset) override {
@ -102,9 +103,9 @@ public:
.log();
// increment the metric for bit flips
auto res = g_network->global(INetwork::enChaosMetrics);
if (res) {
ChaosMetrics* chaosMetrics = static_cast<ChaosMetrics*>(res);
auto chaosMetricsPointer = g_network->global(INetwork::enChaosMetrics);
if (chaosMetricsPointer) {
ChaosMetrics* chaosMetrics = static_cast<ChaosMetrics*>(chaosMetricsPointer);
chaosMetrics->bitFlips++;
}
}
@ -112,28 +113,30 @@ public:
}
// Wait for diskDelay before submitting the I/O
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(delay(getDelay()), [=](Void _) -> Future<Void> {
if (pdata) {
// if (g_network->isSimulated())
return map(holdWhile(arena, file->write(pdata, length, offset)), [corruptedBlock, this](auto res) {
if (g_network->isSimulated()) {
g_simulator->corruptedBlocks.template emplace(file->getFilename(), corruptedBlock);
}
return res;
});
}
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(getDelay()), [=, file = file](Void _) -> Future<Void> {
if (pdata) {
return map(
holdWhile(arena, file->write(pdata, length, offset)), [corruptedBlock, file = file](auto res) {
if (g_network->isSimulated()) {
g_simulator->corruptedBlocks.template emplace(file->getFilename(), corruptedBlock);
}
return res;
});
}
return map(file->write(data, length, offset), [this, pdata, offset, length](auto res) {
if (pdata != nullptr || !g_network->isSimulated()) {
return res;
}
g_simulator->corruptedBlocks.erase(
g_simulator->corruptedBlocks.lower_bound(std::make_pair(file->getFilename(), offset / 4096)),
g_simulator->corruptedBlocks.upper_bound(
std::make_pair(file->getFilename(), (offset + length) / 4096)));
return res;
});
});
return map(file->write(data, length, offset), [this, pdata, offset, length, file = file](auto res) {
if (pdata != nullptr || !g_network->isSimulated()) {
return res;
}
g_simulator->corruptedBlocks.erase(
g_simulator->corruptedBlocks.lower_bound(std::make_pair(file->getFilename(), offset / 4096)),
g_simulator->corruptedBlocks.upper_bound(
std::make_pair(file->getFilename(), (offset + length) / 4096)));
return res;
});
});
}
Future<Void> truncate(int64_t size) override {
@ -142,8 +145,9 @@ public:
return file->truncate(size);
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [this, size](Void _) -> Future<Void> {
delay(diskDelay), [this, size, file = file](Void _) -> Future<Void> {
constexpr auto maxBlockValue =
std::numeric_limits<decltype(g_simulator->corruptedBlocks)::key_type::second_type>::max();
auto firstDeletedBlock =
@ -161,8 +165,9 @@ public:
return file->sync();
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=](Void _) -> Future<Void> { return file->sync(); });
delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->sync(); });
}
Future<int64_t> size() const override {
@ -171,8 +176,9 @@ public:
return file->size();
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<int64_t>(Void)>, int64_t>(
delay(diskDelay), [=](Void _) -> Future<int64_t> { return file->size(); });
delay(diskDelay), [=, file = file](Void _) -> Future<int64_t> { return file->size(); });
}
int64_t debugFD() const override { return file->debugFD(); }
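The capture changes above all address the same lifetime bug, sketched here in isolation: a default [=] capture copies `this`, so reading the `file` member after the delay dereferences a possibly-destroyed wrapper. Copying the Reference into the lambda pins the file instead (hedged sketch):

struct Wrapper : ReferenceCounted<Wrapper> {
	Reference<IAsyncFile> file;
	double diskDelay;
	Future<int64_t> sizeLater() {
		// BUG: [=] captures `this`; if *this is destroyed during the delay, `file` dangles.
		// FIX: [file = file] copies the reference-counted handle, which outlives the wrapper.
		return mapAsync<Void, std::function<Future<int64_t>(Void)>, int64_t>(
		    delay(diskDelay), [file = file](Void) -> Future<int64_t> { return file->size(); });
	}
};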

View File

@ -46,12 +46,17 @@ ACTOR Future<Void> sendErrorOnProcess(ISimulator::ProcessInfo* process,
TaskPriority taskID);
ACTOR template <class T>
Future<T> sendErrorOnShutdown(Future<T> in) {
choose {
when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
throw io_error().asInjectedFault();
Future<T> sendErrorOnShutdown(Future<T> in, bool assertOnCancel = false) {
try {
choose {
when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
throw io_error().asInjectedFault();
}
when(T rep = wait(in)) { return rep; }
}
when(T rep = wait(in)) { return rep; }
} catch (Error& e) {
ASSERT(e.code() != error_code_actor_cancelled || !assertOnCancel);
throw;
}
}
@ -59,9 +64,12 @@ class AsyncFileDetachable final : public IAsyncFile, public ReferenceCounted<Asy
private:
Reference<IAsyncFile> file;
Future<Void> shutdown;
bool assertOnReadWriteCancel;
public:
explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file) { shutdown = doShutdown(this); }
explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file), assertOnReadWriteCancel(true) {
shutdown = doShutdown(this);
}
ACTOR Future<Void> doShutdown(AsyncFileDetachable* self) {
wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()));
@ -84,13 +92,13 @@ public:
Future<int> read(void* data, int length, int64_t offset) override {
if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
return io_error().asInjectedFault();
return sendErrorOnShutdown(file->read(data, length, offset));
return sendErrorOnShutdown(file->read(data, length, offset), assertOnReadWriteCancel);
}
Future<Void> write(void const* data, int length, int64_t offset) override {
if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
return io_error().asInjectedFault();
return sendErrorOnShutdown(file->write(data, length, offset));
return sendErrorOnShutdown(file->write(data, length, offset), assertOnReadWriteCancel);
}
Future<Void> truncate(int64_t size) override {

View File

@ -52,7 +52,7 @@ public:
state Reference<CacheBlock> block(new CacheBlock(length));
try {
int len = wait(f->m_f->read(block->data, length, offset));
int len = wait(uncancellable(holdWhile(block, f->m_f->read(block->data, length, offset))));
block->len = len;
} catch (Error& e) {
f->m_max_concurrent_reads.release(1);

View File

@ -32,14 +32,18 @@ public:
// For read() and write(), the data buffer must remain valid until the future is ready
Future<int> read(void* data, int length, int64_t offset) override {
return map(m_f->read(data, length, offset), [=](int r) {
updateChecksumHistory(false, offset, r, (uint8_t*)data);
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->read(data, length, offset), [self, data, offset](int r) {
self->updateChecksumHistory(false, offset, r, (uint8_t*)data);
return r;
});
}
Future<Void> readZeroCopy(void** data, int* length, int64_t offset) override {
return map(m_f->readZeroCopy(data, length, offset), [=](Void r) {
updateChecksumHistory(false, offset, *length, (uint8_t*)data);
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->readZeroCopy(data, length, offset), [self, data, length, offset](Void r) {
self->updateChecksumHistory(false, offset, *length, (uint8_t*)data);
return r;
});
}
@ -50,12 +54,14 @@ public:
}
Future<Void> truncate(int64_t size) override {
return map(m_f->truncate(size), [=](Void r) {
// Lambda must hold a reference to this to keep it alive until after the truncate
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->truncate(size), [self, size](Void r) {
// Truncate the page checksum history if it is in use
if ((size / checksumHistoryPageSize) < checksumHistory.size()) {
int oldCapacity = checksumHistory.capacity();
checksumHistory.resize(size / checksumHistoryPageSize);
checksumHistoryBudget.get() -= (checksumHistory.capacity() - oldCapacity);
if ((size / checksumHistoryPageSize) < self->checksumHistory.size()) {
int oldCapacity = self->checksumHistory.capacity();
self->checksumHistory.resize(size / checksumHistoryPageSize);
checksumHistoryBudget.get() -= (self->checksumHistory.capacity() - oldCapacity);
}
return r;
});

View File

@ -239,7 +239,7 @@ public:
// Sets endpoint to be a new local endpoint which delivers messages to the given receiver
void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
void addEndpoints(std::vector<std::pair<struct FlowReceiver*, TaskPriority>> const& streams);
void addEndpoints(std::vector<std::pair<class FlowReceiver*, TaskPriority>> const& streams);
// The given local endpoint no longer delivers messages to the given receiver or uses resources
void removeEndpoint(const Endpoint&, NetworkMessageReceiver*);

View File

@ -42,6 +42,8 @@ struct TenantInfo {
// Is set during deserialization. It will be set to true if the tenant
// name is set and the client is authorized to use this tenant.
bool tenantAuthorized = false;
// Number of storage bytes currently used by this tenant.
int64_t storageUsage = 0;
// Helper function for most endpoints that read/write data. This returns true iff
// the client is either a) a trusted peer or b) is accessing keyspace belonging to a tenant,

View File

@ -28,9 +28,14 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/networksender.actor.h"
struct FlowReceiver : public NetworkMessageReceiver {
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
class FlowReceiver : public NetworkMessageReceiver, public NonCopyable {
Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
protected:
FlowReceiver() : m_isLocalEndpoint(false), m_stream(false) {}
FlowReceiver(Endpoint const& remoteEndpoint, bool stream)
@ -46,8 +51,17 @@ struct FlowReceiver : public NetworkMessageReceiver {
}
}
bool isLocalEndpoint() { return m_isLocalEndpoint; }
bool isRemoteEndpoint() { return endpoint.isValid() && !m_isLocalEndpoint; }
public:
bool isLocalEndpoint() const { return m_isLocalEndpoint; }
bool isRemoteEndpoint() const { return endpoint.isValid() && !m_isLocalEndpoint; }
void setRemoteEndpoint(Endpoint const& remoteEndpoint, bool stream) {
ASSERT(!m_isLocalEndpoint);
ASSERT(!endpoint.isValid());
endpoint = remoteEndpoint;
m_stream = stream;
FlowTransport::transport().addPeerReference(endpoint, m_stream);
}
// If already a remote endpoint, returns that. Otherwise makes this
// a local endpoint and returns that.
@ -80,12 +94,6 @@ struct FlowReceiver : public NetworkMessageReceiver {
}
const Endpoint& getRawEndpoint() { return endpoint; }
private:
Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
};
template <class T>
@ -363,8 +371,9 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
this->sendError(message.getError());
} else {
if (message.get().asUnderlyingType().acknowledgeToken.present()) {
acknowledgements = AcknowledgementReceiver(
FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()));
acknowledgements.setRemoteEndpoint(
FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()),
false);
if (onConnect.isValid() && onConnect.canBeSet()) {
onConnect.send(Void());
}

View File

@ -120,6 +120,7 @@ public:
bool excludeFromRestarts = false;
std::vector<ProcessInfo*> childs;
bool drProcess = false;
ProcessInfo(const char* name,
LocalityData locality,

View File

@ -1253,6 +1253,7 @@ public:
PromiseTask* task = self->taskQueue.getReadyTask();
self->taskQueue.popReadyTask();
self->execTask(*task);
delete task;
self->yielded = false;
}
}
@ -2275,7 +2276,7 @@ public:
}
// Implementation
struct PromiseTask final {
struct PromiseTask final : public FastAllocated<PromiseTask> {
Promise<Void> promise;
ProcessInfo* machine;
explicit PromiseTask(ProcessInfo* machine) : machine(machine) {}

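For context on the PromiseTask change: deriving from FastAllocated<T> gives the class its own operator new/delete backed by flow's fixed-size-block allocator, which pays off for objects allocated and freed on every run-loop iteration, as PromiseTask now is. A hedged sketch:

struct MyTask : public FastAllocated<MyTask> {
	int payload = 0;
};

MyTask* t = new MyTask(); // served from the per-size-class fast allocator, not malloc
delete t;                 // returned to the same free list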
View File

@ -687,6 +687,9 @@ struct DDQueue : public IDDRelocationQueue {
Reference<EventCacheHolder> movedKeyServersEventHolder;
int moveReusePhysicalShard;
int moveCreateNewPhysicalShard;
void startRelocation(int priority, int healthPriority) {
// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
// we must count it into unhealthyRelocations, because the team remover relies on unhealthyRelocations to
@ -750,7 +753,8 @@ struct DDQueue : public IDDRelocationQueue {
output(output), input(input), getShardMetrics(getShardMetrics), getTopKMetrics(getTopKMetrics), lastInterval(0),
suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")) {}
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
moveCreateNewPhysicalShard(0) {}
DDQueue() = default;
void validate() {
@ -1676,6 +1680,11 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate
// thus, update the physicalShardIDCandidate to related data structures
ASSERT(physicalShardIDCandidate != UID().first());
if (self->physicalShardCollection->physicalShardExists(physicalShardIDCandidate)) {
self->moveReusePhysicalShard++;
} else {
self->moveCreateNewPhysicalShard++;
}
rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
inFlightRange.value().dataMoveId = rd.dataMoveId;
@ -2472,6 +2481,14 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
.trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by
// DataDistributor::movingDataEventHolder. The track latest
// key we use here must match the key used in the holder.
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
TraceEvent("PhysicalShardMoveStats")
.detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
.detail("MoveReusePhysicalShard", self.moveReusePhysicalShard);
self.moveCreateNewPhysicalShard = 0;
self.moveReusePhysicalShard = 0;
}
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(ddQueueFutures))) {}

View File

@ -2081,6 +2081,10 @@ void PhysicalShardCollection::logPhysicalShardCollection() {
}
}
bool PhysicalShardCollection::physicalShardExists(uint64_t physicalShardID) {
return physicalShardInstances.find(physicalShardID) != physicalShardInstances.end();
}
// FIXME: complete this test with non-empty range
TEST_CASE("/DataDistributor/Tracker/FetchTopK") {
state DataDistributionTracker self;

View File

@ -25,6 +25,8 @@
#include "fdbclient/DatabaseContext.h"
#include "flow/actorcompiler.h" // This must be the last #include.
FDB_DEFINE_BOOLEAN_PARAM(SkipDDModeCheck);
class DDTxnProcessorImpl {
friend class DDTxnProcessor;
@ -240,7 +242,8 @@ class DDTxnProcessorImpl {
UID distributorId,
MoveKeysLock moveKeysLock,
std::vector<Optional<Key>> remoteDcIds,
const DDEnabledState* ddEnabledState) {
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
state Reference<InitialDataDistribution> result = makeReference<InitialDataDistribution>();
state Key beginKey = allKeys.begin;
@ -253,6 +256,7 @@ class DDTxnProcessorImpl {
state std::vector<std::pair<StorageServerInterface, ProcessClass>> tss_servers;
state int numDataMoves = 0;
CODE_PROBE((bool)skipDDModeCheck, "DD Mode won't prevent read initial data distribution.");
// Get the server list in its own try/catch block since it modifies result. We don't want a subsequent failure
// causing entries to be duplicated
loop {
@ -285,7 +289,7 @@ class DDTxnProcessorImpl {
BinaryReader rd(mode.get(), Unversioned());
rd >> result->mode;
}
if (!result->mode || !ddEnabledState->isDDEnabled()) {
if ((!skipDDModeCheck && !result->mode) || !ddEnabledState->isDDEnabled()) {
// DD can be disabled persistently (result->mode = 0) or transiently (isDDEnabled() = 0)
TraceEvent(SevDebug, "GetInitialDataDistribution_DisabledDD").log();
return result;
@ -620,8 +624,10 @@ Future<Reference<InitialDataDistribution>> DDTxnProcessor::getInitialDataDistrib
const UID& distributorId,
const MoveKeysLock& moveKeysLock,
const std::vector<Optional<Key>>& remoteDcIds,
const DDEnabledState* ddEnabledState) {
return DDTxnProcessorImpl::getInitialDataDistribution(cx, distributorId, moveKeysLock, remoteDcIds, ddEnabledState);
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
return DDTxnProcessorImpl::getInitialDataDistribution(
cx, distributorId, moveKeysLock, remoteDcIds, ddEnabledState, skipDDModeCheck);
}
Future<Void> DDTxnProcessor::waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const {
@ -681,6 +687,33 @@ Future<std::vector<ProcessData>> DDTxnProcessor::getWorkers() const {
return ::getWorkers(cx);
}
Future<Void> DDTxnProcessor::rawStartMovement(MoveKeysParams& params,
std::map<UID, StorageServerInterface>& tssMapping) {
return ::rawStartMovement(cx, params, tssMapping);
}
Future<Void> DDTxnProcessor::rawFinishMovement(MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
return ::rawFinishMovement(cx, params, tssMapping);
}
struct DDMockTxnProcessorImpl {
ACTOR static Future<Void> moveKeys(DDMockTxnProcessor* self, MoveKeysParams params) {
state std::map<UID, StorageServerInterface> tssMapping;
self->rawStartMovement(params, tssMapping);
ASSERT(tssMapping.empty());
if (BUGGIFY_WITH_PROB(0.5)) {
wait(delayJittered(5.0));
}
self->rawFinishMovement(params, tssMapping);
if (!params.dataMovementComplete.isSet())
params.dataMovementComplete.send(Void());
return Void();
}
};
Future<ServerWorkerInfos> DDMockTxnProcessor::getServerListAndProcessClasses() {
ServerWorkerInfos res;
for (auto& [_, mss] : mgs->allServers) {
@ -757,7 +790,8 @@ Future<Reference<InitialDataDistribution>> DDMockTxnProcessor::getInitialDataDis
const UID& distributorId,
const MoveKeysLock& moveKeysLock,
const std::vector<Optional<Key>>& remoteDcIds,
const DDEnabledState* ddEnabledState) {
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
// FIXME: now we just ignore ddEnabledState and moveKeysLock, will fix it in the future
Reference<InitialDataDistribution> res = makeReference<InitialDataDistribution>();
@ -817,9 +851,10 @@ void DDMockTxnProcessor::setupMockGlobalState(Reference<InitialDataDistribution>
mgs->shardMapping->setCheckMode(ShardsAffectedByTeamFailure::CheckMode::Normal);
}
// FIXME: finish moveKeys implementation
Future<Void> DDMockTxnProcessor::moveKeys(const MoveKeysParams& params) {
UNREACHABLE();
// Not support location metadata yet
ASSERT(!SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
return DDMockTxnProcessorImpl::moveKeys(this, params);
}
// FIXME: finish implementation
@ -851,3 +886,48 @@ Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorage
Future<std::vector<ProcessData>> DDMockTxnProcessor::getWorkers() const {
return Future<std::vector<ProcessData>>();
}
void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::map<UID, StorageServerInterface>& tssMapping) {
FlowLock::Releaser releaser(*params.startMoveKeysParallelismLock);
// A wait(take()) here would always return immediately because there won't be parallel rawStart or rawFinish
// calls in the mock world: the *mock* transaction code below always finishes without a coroutine switch.
ASSERT(params.startMoveKeysParallelismLock->take().isReady());
std::vector<ShardsAffectedByTeamFailure::Team> destTeams;
destTeams.emplace_back(params.destinationTeam, true);
mgs->shardMapping->moveShard(params.keys, destTeams);
for (auto& id : params.destinationTeam) {
mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::INFLIGHT, mgs->restrictSize);
}
}
void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
FlowLock::Releaser releaser(*params.finishMoveKeysParallelismLock);
// A wait(take) here would always return immediately because there won't be parallel rawStart or rawFinish calls
// in the mock world: the following *mock* transaction code always finishes without a coroutine switch.
ASSERT(params.finishMoveKeysParallelismLock->take().isReady());
// get source and dest teams
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsFor(params.keys);
ASSERT_EQ(destTeams.size(), 1);
if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) {
TraceEvent(SevError, "MockRawFinishMovementError")
.detail("Reason", "InconsistentDestinations")
.detail("ShardMappingDest", describe(destTeams.front().servers))
.detail("ParamDest", describe(params.destinationTeam));
ASSERT(false); // This shouldn't happen because the overlapped key range movement won't be executed in parallel
}
for (auto& id : params.destinationTeam) {
mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::COMPLETED, mgs->restrictSize);
}
ASSERT_EQ(srcTeams.size(), 1);
for (auto& id : srcTeams.front().servers) {
mgs->allServers.at(id).removeShard(params.keys);
}
mgs->shardMapping->finishMove(params.keys);
}
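The Releaser-then-ASSERT(take().isReady()) pattern in both raw methods asserts non-contention rather than actually queueing: in the mock world the body runs without a coroutine switch, so the permit must be free. A simplified RAII sketch of that pattern (toy code; Flow's FlowLock is more general):
#include <cassert>
// Toy counting lock standing in for Flow's FlowLock (illustration only).
struct ToyFlowLock {
	int permits = 1;
	bool tryTake() {
		if (permits == 0)
			return false;
		--permits;
		return true;
	}
	void release() { ++permits; }
};
// RAII releaser, analogous in spirit to FlowLock::Releaser.
struct ToyReleaser {
	ToyFlowLock& lock;
	explicit ToyReleaser(ToyFlowLock& l) : lock(l) {}
	~ToyReleaser() { lock.release(); } // runs on scope exit, even on throw
};
int main() {
	ToyFlowLock parallelismLock;
	ToyReleaser releaser(parallelismLock);
	assert(parallelismLock.tryTake()); // mirrors ASSERT(take().isReady())
	return 0;
}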

View File

@ -316,7 +316,8 @@ public:
ddId,
lock,
configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(),
context->ddEnabledState.get()));
context->ddEnabledState.get(),
SkipDDModeCheck::False));
}
void initDcInfo() {
@ -692,6 +693,10 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
"DDTenantCacheMonitor",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(),
"StorageUsageTracker",
self->ddId,
&normalDDQueueErrors()));
}
std::vector<DDTeamCollection*> teamCollectionsPtrs;

View File

@ -429,7 +429,7 @@ public:
waitfor.push_back(self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos));
self->writingPos += pageData.size();
return waitForAll(waitfor);
return waitForAllReadyThenThrow(waitfor);
}
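The switch from waitForAll to waitForAllReadyThenThrow changes error timing: every outstanding write is allowed to settle before the first error is rethrown, instead of an error propagating while sibling disk I/O is still in flight. A standalone sketch of that semantics using std::future (illustrative only; Flow futures differ):
#include <exception>
#include <future>
#include <vector>
// Block until every future has settled, then rethrow the first stored
// error, if any. A naive loop over get() would throw on the first
// failure and abandon the remaining in-flight operations.
void waitForAllReadyThenThrowSketch(std::vector<std::future<void>>& futures) {
	std::exception_ptr firstError;
	for (auto& f : futures) {
		try {
			f.get(); // waits for readiness; throws the stored exception, if any
		} catch (...) {
			if (!firstError)
				firstError = std::current_exception();
		}
	}
	if (firstError)
		std::rethrow_exception(firstError);
}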
// Write the given data (pageData) to the queue files of self, sync data to disk, and delete the memory (pageMem)
@ -655,7 +655,7 @@ public:
for (int i = 0; i < 2; i++)
if (self->files[i].size > 0)
reads.push_back(self->files[i].f->read(self->firstPages[i], sizeof(Page), 0));
wait(waitForAll(reads));
wait(waitForAllReadyThenThrow(reads));
// Determine which file comes first
if (compare(self->firstPages[1], self->firstPages[0])) {
@ -743,7 +743,10 @@ public:
}
// Read nPages from pageOffset*sizeof(Page) offset in file self->files[file]
ACTOR static Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self, int file, int pageOffset, int nPages) {
ACTOR static UNCANCELLABLE Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self,
int file,
int pageOffset,
int nPages) {
state TrackMe trackMe(self);
state const size_t bytesRequested = nPages * sizeof(Page);
state Standalone<StringRef> result = makeAlignedString(sizeof(Page), bytesRequested);

View File

@ -31,6 +31,7 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "flow/Arena.h"
#include "flow/CodeProbe.h"
#include "flow/EncryptUtils.h"
#include "flow/Error.h"
#include "flow/EventTypes.actor.h"
@ -387,6 +388,15 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData
try {
KmsConnLookupEKsByKeyIdsReq keysByIdsReq;
for (const auto& item : lookupCipherInfoMap) {
// TODO: Currently getEncryptCipherKeys does not pass the domain name; once that is fixed, we can remove
// the check for an empty domain name
if (!item.second.domainName.empty()) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
}
keysByIdsReq.encryptKeyInfos.emplace_back_deep(
keysByIdsReq.arena, item.second.domainId, item.second.baseCipherId, item.second.domainName);
}
@ -452,6 +462,8 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData
keyIdsReply.numHits = cachedCipherDetails.size();
keysByIds.reply.send(keyIdsReply);
CODE_PROBE(!lookupCipherInfoMap.empty(), "EKP fetch cipherKeys by KeyId from KMS");
return Void();
}
@ -475,13 +487,13 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
// Dedup the requested domainIds.
// TODO: endpoint serialization of std::unordered_set isn't working at the moment
std::unordered_map<EncryptCipherDomainId, EKPGetLatestCipherKeysRequestInfo> dedupedDomainInfos;
for (const auto info : req.encryptDomainInfos) {
for (const auto& info : req.encryptDomainInfos) {
dedupedDomainInfos.emplace(info.domainId, info);
}
if (dbgTrace.present()) {
dbgTrace.get().detail("NKeys", dedupedDomainInfos.size());
for (const auto info : dedupedDomainInfos) {
for (const auto& info : dedupedDomainInfos) {
// log encryptDomainIds queried
dbgTrace.get().detail(
getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_QUERY_PREFIX, info.first, info.second.domainName), "");
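The dedup loop above relies on unordered_map::emplace keeping the first-seen entry for a duplicate domainId; later duplicates are ignored. A quick standalone check of that semantics:
#include <cassert>
#include <string>
#include <unordered_map>
int main() {
	std::unordered_map<long long, std::string> dedup;
	dedup.emplace(1, "first");
	dedup.emplace(1, "second"); // no-op: key 1 is already present
	assert(dedup.at(1) == "first");
	return 0;
}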
@ -524,6 +536,11 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
try {
KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq;
for (const auto& item : lookupCipherDomains) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
keysByDomainIdReq.encryptDomainInfos.emplace_back_deep(
keysByDomainIdReq.arena, item.second.domainId, item.second.domainName);
}
@ -588,6 +605,8 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
latestCipherReply.numHits = cachedCipherDetails.size();
latestKeysReq.reply.send(latestCipherReply);
CODE_PROBE(!lookupCipherDomains.empty(), "EKP fetch latest cipherKeys from KMS");
return Void();
}
@ -610,7 +629,7 @@ bool isBlobMetadataEligibleForRefresh(const BlobMetadataDetailsRef& blobMetadata
return nextRefreshCycleTS > blobMetadata.expireAt || nextRefreshCycleTS > blobMetadata.refreshAt;
}
ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpProxyData,
ACTOR Future<Void> refreshEncryptionKeysImpl(Reference<EncryptKeyProxyData> ekpProxyData,
KmsConnectorInterface kmsConnectorInf) {
state UID debugId = deterministicRandom()->randomUniqueID();
@ -672,6 +691,7 @@ ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpP
ekpProxyData->baseCipherKeysRefreshed += rep.cipherKeyDetails.size();
t.detail("NumKeys", rep.cipherKeyDetails.size());
CODE_PROBE(!rep.cipherKeyDetails.empty(), "EKP refresh cipherKeys");
} catch (Error& e) {
if (!canReplyWith(e)) {
TraceEvent(SevWarn, "RefreshEKsError").error(e);
@ -685,7 +705,7 @@ ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpP
}
Future<Void> refreshEncryptionKeys(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnectorInterface kmsConnectorInf) {
return refreshEncryptionKeysCore(ekpProxyData, kmsConnectorInf);
return refreshEncryptionKeysImpl(ekpProxyData, kmsConnectorInf);
}
ACTOR Future<Void> getLatestBlobMetadata(Reference<EncryptKeyProxyData> ekpProxyData,
@ -775,7 +795,7 @@ ACTOR Future<Void> refreshBlobMetadataCore(Reference<EncryptKeyProxyData> ekpPro
state UID debugId = deterministicRandom()->randomUniqueID();
state double startTime;
state TraceEvent t("RefreshBlobMetadata_Start", ekpProxyData->myId);
state TraceEvent t("RefreshBlobMetadataStart", ekpProxyData->myId);
t.setMaxEventLength(SERVER_KNOBS->ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH);
t.detail("KmsConnInf", kmsConnectorInf.id());
t.detail("DebugId", debugId);
@ -817,7 +837,7 @@ ACTOR Future<Void> refreshBlobMetadataCore(Reference<EncryptKeyProxyData> ekpPro
t.detail("nKeys", rep.metadataDetails.size());
} catch (Error& e) {
if (!canReplyWith(e)) {
TraceEvent("RefreshBlobMetadata_Error").error(e);
TraceEvent("RefreshBlobMetadataError").error(e);
throw e;
}
TraceEvent("RefreshBlobMetadata").detail("ErrorCode", e.code());
@ -832,24 +852,25 @@ void refreshBlobMetadata(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnect
}
void activateKmsConnector(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnectorInterface kmsConnectorInf) {
if (g_network->isSimulated() || (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(FDB_PREF_KMS_CONNECTOR_TYPE_STR) == 0)) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>();
if (g_network->isSimulated()) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>(FDB_SIM_KMS_CONNECTOR_TYPE_STR);
} else if (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(FDB_PREF_KMS_CONNECTOR_TYPE_STR) == 0) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>(FDB_PREF_KMS_CONNECTOR_TYPE_STR);
} else if (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(REST_KMS_CONNECTOR_TYPE_STR) == 0) {
ekpProxyData->kmsConnector = std::make_unique<RESTKmsConnector>();
ekpProxyData->kmsConnector = std::make_unique<RESTKmsConnector>(REST_KMS_CONNECTOR_TYPE_STR);
} else {
throw not_implemented();
}
TraceEvent("EKPActiveKmsConnector", ekpProxyData->myId)
.detail("ConnectorType",
g_network->isSimulated() ? FDB_SIM_KMS_CONNECTOR_TYPE_STR : SERVER_KNOBS->KMS_CONNECTOR_TYPE)
.detail("ConnectorType", ekpProxyData->kmsConnector->getConnectorStr())
.detail("InfId", kmsConnectorInf.id());
ekpProxyData->addActor.send(ekpProxyData->kmsConnector->connectorCore(kmsConnectorInf));
}
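The dispatch above now stores the connector-type string inside the connector itself, so the trace event reports whatever was actually constructed. A hypothetical factory sketch of that shape (names are illustrative, not FDB's):
#include <memory>
#include <stdexcept>
#include <string>
// Illustrative stand-in for a KMS connector that remembers its type.
struct ToyKmsConnector {
	std::string typeStr;
	explicit ToyKmsConnector(std::string t) : typeStr(std::move(t)) {}
	const std::string& getConnectorStr() const { return typeStr; }
};
std::unique_ptr<ToyKmsConnector> makeConnector(bool simulated, const std::string& knobType) {
	if (simulated)
		return std::make_unique<ToyKmsConnector>("SimKmsConnector");
	if (knobType == "RESTKmsConnector")
		return std::make_unique<ToyKmsConnector>(knobType);
	throw std::runtime_error("not_implemented"); // mirrors throw not_implemented()
}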
ACTOR Future<Void> encryptKeyProxyServer(EncryptKeyProxyInterface ekpInterface, Reference<AsyncVar<ServerDBInfo>> db) {
state Reference<EncryptKeyProxyData> self(new EncryptKeyProxyData(ekpInterface.id()));
state Reference<EncryptKeyProxyData> self = makeReference<EncryptKeyProxyData>(ekpInterface.id());
state Future<Void> collection = actorCollection(self->addActor.getFuture());
self->addActor.send(traceRole(Role::ENCRYPT_KEY_PROXY, ekpInterface.id()));

View File

@ -53,7 +53,11 @@ struct KeyValueStoreCompressTestData final : IKeyValueStore {
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override {
store->set(KeyValueRef(keyValue.key, pack(keyValue.value)), arena);
}
void clear(KeyRangeRef range, const Arena* arena = nullptr) override { store->clear(range, arena); }
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override {
store->clear(range, storageMetrics, arena);
}
Future<Void> commit(bool sequential = false) override { return store->commit(sequential); }
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override {

View File

@ -130,7 +130,7 @@ public:
}
}
void clear(KeyRangeRef range, const Arena* arena) override {
void clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) override {
// A commit that occurs with no available space returns Never, so we can throw out all modifications
if (getAvailableSize() <= 0)
return;

View File

@ -1846,22 +1846,52 @@ struct RocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
writeBatch->Put(defaultFdbCF, toSlice(kv.key), toSlice(kv.value));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE) {
keysSet.insert(kv.key);
}
}
void clear(KeyRangeRef keyRange, const Arena*) override {
void clear(KeyRangeRef keyRange, const StorageServerMetrics* storageMetrics, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
if (keyRange.singleKeyRange()) {
writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin));
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr &&
storageMetrics->byteSample.getEstimate(keyRange) <
SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) {
rocksdb::ReadOptions options = sharedState->getReadOptions();
auto beginSlice = toSlice(keyRange.begin);
auto endSlice = toSlice(keyRange.end);
options.iterate_lower_bound = &beginSlice;
options.iterate_upper_bound = &endSlice;
auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options, defaultFdbCF));
cursor->Seek(toSlice(keyRange.begin));
while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) {
writeBatch->Delete(defaultFdbCF, cursor->key());
cursor->Next();
}
if (!cursor->status().ok()) {
// If the read-range iteration fails, fall back to a DeleteRange.
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
} else {
auto it = keysSet.lower_bound(keyRange.begin);
while (it != keysSet.end() && *it < keyRange.end) {
writeBatch->Delete(defaultFdbCF, toSlice(*it));
it++;
}
}
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
}
}
}
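The motivation for the branch above: RocksDB's DeleteRange writes a range tombstone, which is cheap to write but slows iterators until compaction; for small ranges (as estimated by the byte sample) issuing point deletes avoids tombstone buildup, and keysSet additionally covers keys staged in the current, not-yet-committed batch that the iterator cannot see. A hypothetical helper capturing the decision rule (names are illustrative):
#include <cstdint>
// Returns true when a clear-range should be lowered to point deletes.
// Point deletes avoid range tombstones but require enumerating the keys,
// so they only pay off for small ranges.
bool shouldUseSingleKeyDeletes(bool knobEnabled,
                               bool haveByteSample,
                               uint64_t estimatedBytesInRange,
                               uint64_t singleKeyDeleteBytesLimit) {
	return knobEnabled && haveByteSample && estimatedBytesInRange < singleKeyDeleteBytesLimit;
}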
@ -1890,6 +1920,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
auto a = new Writer::CommitAction();
a->batchToCommit = std::move(writeBatch);
keysSet.clear();
auto res = a->done.getFuture();
writeThread->post(a);
return res;
@ -2083,6 +2114,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Promise<Void> closePromise;
Future<Void> openFuture;
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::set<Key> keysSet;
Optional<Future<Void>> metrics;
FlowLock readSemaphore;
int numReadWaiters;

View File

@ -1603,7 +1603,9 @@ public:
StorageBytes getStorageBytes() const override;
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override;
Future<Void> commit(bool sequential = false) override;
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override;
@ -2222,7 +2224,7 @@ void KeyValueStoreSQLite::set(KeyValueRef keyValue, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::SetAction(keyValue));
}
void KeyValueStoreSQLite::clear(KeyRangeRef range, const Arena* arena) {
void KeyValueStoreSQLite::clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::ClearAction(range));
}
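The interface change threaded through these stores: clear() gains a defaulted StorageServerMetrics pointer, so engines that can exploit the byte sample (RocksDB) receive it while the others (SQLite, memory) simply ignore it and old call sites keep compiling. A simplified sketch of that signature evolution (toy types, not the real IKeyValueStore):
struct ToyMetrics {}; // stand-in for StorageServerMetrics
struct ToyArena {}; // stand-in for Arena
struct ToyRange {}; // stand-in for KeyRangeRef
struct ToyKeyValueStore {
	// Defaulted parameters keep existing two-argument call sites valid.
	virtual void clear(ToyRange range, const ToyMetrics* metrics = nullptr, const ToyArena* arena = nullptr) = 0;
	virtual ~ToyKeyValueStore() = default;
};
struct ToySqliteStore final : ToyKeyValueStore {
	// Engines that cannot use the metrics simply ignore the new parameter.
	void clear(ToyRange, const ToyMetrics*, const ToyArena*) override {}
};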

View File

@ -49,6 +49,7 @@ static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 :
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
const std::string rocksDataFolderSuffix = "-data";
const std::string METADATA_SHARD_ID = "kvs-metadata";
const KeyRef shardMappingPrefix("\xff\xff/ShardMapping/"_sr);
// TODO: move constants to a header file.
const StringRef ROCKSDBSTORAGE_HISTOGRAM_GROUP = "RocksDBStorage"_sr;
@ -304,13 +305,12 @@ rocksdb::ReadOptions getReadOptions() {
}
struct ReadIterator {
rocksdb::ColumnFamilyHandle* cf;
uint64_t index; // incrementing counter to uniquely identify read iterator.
bool inUse;
std::shared_ptr<rocksdb::Iterator> iter;
double creationTime;
ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, rocksdb::ReadOptions& options)
: cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
: index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
};
/*
@ -475,13 +475,26 @@ struct PhysicalShard {
}
~PhysicalShard() {
if (!deletePending)
return;
logShardEvent(id, ShardOp::CLOSE);
isInitialized.store(false);
readIterPool.reset();
// Destroy CF
auto s = db->DropColumnFamily(cf);
// Deleting default column family is not allowed.
if (id == "default") {
return;
}
if (deletePending) {
auto s = db->DropColumnFamily(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
}
auto s = db->DestroyColumnFamilyHandle(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logRocksDBError(s, "DestroyCFHandle");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
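For reference, a minimal sketch (assuming an open rocksdb::DB*; simplified from the destructor above) of the teardown order: optionally drop the column family's data, then always destroy the handle; RocksDB forbids dropping the default column family.
#include <rocksdb/db.h>
void closeColumnFamilySketch(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf, bool dropData) {
	if (cf->GetName() == "default")
		return; // dropping the default column family is not allowed
	if (dropData) {
		rocksdb::Status s = db->DropColumnFamily(cf); // marks the CF's data for deletion
		if (!s.ok())
			return; // keep the handle alive if the drop failed, matching the code above
	}
	db->DestroyColumnFamilyHandle(cf); // always release the handle itself
}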
@ -628,7 +641,7 @@ public:
std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
bool foundMetadata = false;
for (const auto& name : columnFamilies) {
if (name == "kvs-metadata") {
if (name == METADATA_SHARD_ID) {
foundMetadata = true;
}
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions });
@ -652,19 +665,19 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
.detail("PhysicalShardCount", handles.size());
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
for (auto handle : handles) {
if (handle->GetName() == "kvs-metadata") {
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
} else {
physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
auto shard = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
if (shard->id == METADATA_SHARD_ID) {
metadataShard = shard;
}
physicalShards[shard->id] = shard;
columnFamilyMap[handle->GetID()] = handle;
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
.detail("PhysicalShard", handle->GetName());
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId).detail("PhysicalShard", shard->id);
}
std::set<std::string> unusedShards(columnFamilies.begin(), columnFamilies.end());
unusedShards.erase("kvs-metadata");
unusedShards.erase(METADATA_SHARD_ID);
unusedShards.erase("default");
KeyRange keyRange = prefixRange(shardMappingPrefix);
@ -746,9 +759,11 @@ public:
defaultShard->dataShards[specialKeys.begin.toString()] = std::move(dataShard);
physicalShards[defaultShard->id] = defaultShard;
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata");
// Create metadata shard.
auto metadataShard = std::make_shared<PhysicalShard>(db, METADATA_SHARD_ID);
metadataShard->init();
columnFamilyMap[metadataShard->cf->GetID()] = metadataShard->cf;
physicalShards[METADATA_SHARD_ID] = metadataShard;
// Write special key range metadata.
writeBatch = std::make_unique<rocksdb::WriteBatch>();
@ -763,7 +778,6 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
writeBatch = std::make_unique<rocksdb::WriteBatch>();
dirtyShards = std::make_unique<std::set<PhysicalShard*>>();
@ -910,6 +924,9 @@ public:
std::vector<std::shared_ptr<PhysicalShard>> getPendingDeletionShards(double cleanUpDelay) {
std::vector<std::shared_ptr<PhysicalShard>> emptyShards;
double currentTime = now();
TraceEvent(SevInfo, "ShardedRocksDB", logId)
.detail("PendingDeletionShardQueueSize", pendingDeletionShards.size());
while (!pendingDeletionShards.empty()) {
const auto& id = pendingDeletionShards.front();
auto it = physicalShards.find(id);
@ -976,6 +993,10 @@ public:
.detail("Info", "RangeToPersist")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
auto it = physicalShards.find(METADATA_SHARD_ID);
ASSERT(it != physicalShards.end());
auto metadataShard = it->second;
writeBatch->DeleteRange(metadataShard->cf,
getShardMappingKey(range.begin, shardMappingPrefix),
getShardMappingKey(range.end, shardMappingPrefix));
@ -1043,24 +1064,30 @@ public:
}
void closeAllShards() {
for (auto& [_, shard] : physicalShards) {
shard->readIterPool.reset();
}
columnFamilyMap.clear();
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBClosed");
}
void destroyAllShards() {
closeAllShards();
std::vector<rocksdb::ColumnFamilyDescriptor> cfs;
for (const auto& [key, _] : physicalShards) {
cfs.push_back(rocksdb::ColumnFamilyDescriptor{ key, getCFOptions() });
columnFamilyMap.clear();
for (auto& [_, shard] : physicalShards) {
shard->deletePending = true;
}
auto s = rocksdb::DestroyDB(path, getOptions(), cfs);
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
s = rocksdb::DestroyDB(path, getOptions());
if (!s.ok()) {
logRocksDBError(s, "DestroyDB");
}
@ -1121,7 +1148,6 @@ private:
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::unique_ptr<std::set<PhysicalShard*>> dirtyShards;
KeyRangeMap<DataShard*> dataShardMap;
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
std::deque<std::string> pendingDeletionShards;
};
@ -2240,6 +2266,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
// TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are
// occurring.
if (g_network->isSimulated()) {
TraceEvent(SevDebug, "ShardedRocksDB").detail("Info", "Use Coro threads in simulation.");
writeThread = CoroThreadPool::createThreadPool();
readThreads = CoroThreadPool::createThreadPool();
} else {
@ -2316,7 +2343,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override { shardManager.put(kv.key, kv.value); }
void clear(KeyRangeRef range, const Arena*) override {
void clear(KeyRangeRef range, const StorageServerMetrics*, const Arena*) override {
if (range.singleKeyRange()) {
shardManager.clear(range.begin);
} else {

View File

@ -31,6 +31,106 @@ bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus s
return true;
}
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
auto ranges = serverKeys.intersectingRanges(range);
ASSERT(!ranges.empty());
if (ranges.begin().range().contains(range)) {
CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
return;
}
if (ranges.begin().begin() < range.begin) {
CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
}
if (ranges.end().end() > range.end) {
CODE_PROBE(true, "Implicitly split end range to 2 pieces");
twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
}
ranges = serverKeys.containedRanges(range);
// now the boundaries must be aligned
ASSERT(ranges.begin().begin() == range.begin);
ASSERT(ranges.end().end() == range.end);
uint64_t newSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
newSize += it->cvalue().shardSize;
}
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
auto oldStatus = it.value().status;
if (isStatusTransitionValid(oldStatus, status)) {
it.value() = ShardInfo{ status, newSize };
} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
CODE_PROBE(true, "Shard already on server");
} else {
TraceEvent(SevError, "MockShardStatusTransitionError")
.detail("From", oldStatus)
.detail("To", status)
.detail("ID", id)
.detail("KeyBegin", range.begin.toHexString())
.detail("KeyEnd", range.begin.toHexString());
}
}
serverKeys.coalesce(range);
}
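A standalone illustration (std::map as a toy range map; FDB's KeyRangeMap differs) of the boundary alignment that setShardStatus performs before mutating statuses: both range.begin and range.end must become split points.
#include <cassert>
#include <iterator>
#include <map>
#include <string>
// Toy range map: key = start of a range, value = status; an entry for ""
// is assumed to always exist, covering the whole keyspace.
using ToyRangeMap = std::map<std::string, int>;
void alignBoundary(ToyRangeMap& m, const std::string& point) {
	auto it = std::prev(m.upper_bound(point)); // range containing `point`
	if (it->first != point)
		m.emplace(point, it->second); // split: a new range starts at `point`
}
int main() {
	ToyRangeMap m{ { "", 0 } };
	alignBoundary(m, "b");
	alignBoundary(m, "d");
	assert(m.count("b") == 1 && m.count("d") == 1); // boundaries now aligned
	return 0;
}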
// Split the outer range [a, d) at the inner range's boundaries [b, c). The result is [a, b), [b, c), [c, d). The
// sizes of the new shards are randomly derived from the old size of [a, d)
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
uint64_t outerRangeSize,
bool restrictSize) {
ASSERT(outerRange.contains(innerRange));
Key left = outerRange.begin;
// Randomly generate 3 shard sizes; the caller guarantees that the min and max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int midSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? outerRangeSize - leftSize - midSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
serverKeys[left].shardSize = leftSize;
serverKeys[innerRange.end].shardSize = rightSize;
}
// Split the range [a, c) at split point b. The result is [a, b), [b, c). The
// sizes of the new shards are randomly derived from the old size of [a, c)
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
KeyRef splitPoint,
uint64_t rangeSize,
bool restrictSize) {
Key left = range.begin;
// Randomly generate 2 shard sizes; the caller guarantees that the min and max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
: SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? rangeSize - leftSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
serverKeys[left].shardSize = leftSize;
}
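A standalone arithmetic check of the size invariant behind the restrictSize path in both splitting helpers: the randomly chosen pieces always partition the old shard size, with every piece at least MIN_SHARD_BYTES (toy constants below; FDB's knobs differ).
#include <cassert>
#include <cstdint>
#include <random>
int main() {
	std::mt19937_64 rng(42);
	const uint64_t minShard = 50, oldSize = 1000; // illustrative byte counts
	// Three-way split under restrictSize, mirroring threeWayShardSplitting.
	std::uniform_int_distribution<uint64_t> leftDist(minShard, oldSize - 2 * minShard);
	uint64_t left = leftDist(rng);
	std::uniform_int_distribution<uint64_t> midDist(minShard, oldSize - left - minShard);
	uint64_t mid = midDist(rng);
	uint64_t right = oldSize - left - mid; // remainder becomes the right piece
	assert(left + mid + right == oldSize);
	assert(left >= minShard && mid >= minShard && right >= minShard);
	return 0;
}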
void MockStorageServer::removeShard(KeyRangeRef range) {
auto ranges = serverKeys.containedRanges(range);
ASSERT(ranges.begin().range() == range);
serverKeys.rawErase(range);
}
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
auto ranges = serverKeys.intersectingRanges(range);
uint64_t totalSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
totalSize += it->cvalue().shardSize;
}
return totalSize;
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
@ -104,8 +204,78 @@ TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
auto id = MockGlobalState::indexToUID(i);
std::cout << "Check server " << i << "\n";
ASSERT(mgs->serverIsSourceForShard(id, allKeys));
ASSERT(mgs->allServers.at(id).serverKeys.sumRange(allKeys.begin, allKeys.end) == 0);
ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
}
return Void();
}
struct MockGlobalStateTester {
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
void testThreeWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.ranges().begin();
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
MockShardStatus oldStatus = it.cvalue().status;
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
Key x2 = keyAfter(x1);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
ranges.pop_front();
ASSERT(ranges.empty());
}
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
void testTwoWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.nthRange(0);
MockShardStatus oldStatus = it.cvalue().status;
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.empty());
}
};
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
MockGlobalStateTester tester;
auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
std::cout << "Test 3-way splitting...\n";
tester.testThreeWaySplitFirstRange(mss);
std::cout << "Test 2-way splitting...\n";
mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
tester.testTwoWaySplitFirstRange(mss);
return Void();
}

Some files were not shown because too many files have changed in this diff