Merge commit '0ae568a872e474c8c755e648efbbe4524e63e445' into storageserver-pml

# Conflicts:
#	fdbserver/VersionedBTree.actor.cpp
Steve Atherton 2022-10-24 22:31:36 -07:00
commit 27dc180b68
134 changed files with 2325 additions and 1009 deletions


@@ -274,93 +274,21 @@ if(NOT WIN32)
@CLUSTER_FILE@
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
)
add_fdbclient_test(
NAME fdb_c_api_tests
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--knob
delete-native-lib-after-loading=false # for properly symbolizing xSAN errors
)
add_fdbclient_test(
NAME fdb_c_api_tests_local_only
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/local_tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--knob
delete-native-lib-after-loading=false # for properly symbolizing xSAN errors
)
add_fdbclient_test(
NAME fdb_c_api_tests_blob_granule
DISABLE_LOG_DUMP
API_TEST_BLOB_GRANULES_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests
--blob-granule-local-file-path
@DATA_DIR@/fdbblob/
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--knob
delete-native-lib-after-loading=false # for properly symbolizing xSAN errors
)
add_fdbclient_test(
NAME fdb_c_api_tests_with_tls
DISABLE_LOG_DUMP
TLS_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--tls-cert-file
@CLIENT_CERT_FILE@
--tls-key-file
@CLIENT_KEY_FILE@
--tls-ca-file
@SERVER_CA_FILE@
--knob
delete-native-lib-after-loading=false # for properly symbolizing xSAN errors
)
file(GLOB API_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/test/apitester/tests/*.toml")
foreach(test_file ${API_TEST_FILES})
get_filename_component(file_name "${test_file}" NAME_WE)
set(test_name "fdb_c_api_test_${file_name}")
add_test(NAME "${test_name}"
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--api-tester-bin $<TARGET_FILE:fdb_c_api_tester>
--external-client-library ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-file ${test_file}
--knob delete-native-lib-after-loading=false
)
set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300)
endforeach()
add_test(NAME fdb_c_upgrade_to_future_version
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py


@@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -279,9 +279,9 @@ bool parseArgs(TesterOptions& options, int argc, char** argv) {
return true;
}
void fdb_check(fdb::Error e) {
if (e) {
fmt::print(stderr, "Unexpected FDB error: {}({})\n", e.code(), e.what());
void fdb_check(fdb::Error e, std::string_view msg, fdb::Error::CodeType expectedError = error_code_success) {
if (e.code()) {
fmt::print(stderr, "{}, Error: {}({})\n", msg, e.code(), e.what());
std::abort();
}
}
@@ -453,13 +453,13 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
if (!runWorkloads(options)) {
retCode = 1;
}
fdb_check(fdb::network::stop());
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join();
} catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());


@@ -1,29 +0,0 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -29,31 +29,39 @@ from pathlib import Path
import glob
import random
import string
import toml
sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "tests", "TestRunner")]
# fmt: off
from tmp_cluster import TempCluster
from local_cluster import TLSConfig
# fmt: on
TESTER_STATS_INTERVAL_SEC = 5
def random_string(len):
return ''.join(random.choice(string.ascii_letters + string.digits) for i in range(len))
return "".join(random.choice(string.ascii_letters + string.digits) for i in range(len))
def get_logger():
return logging.getLogger('foundationdb.run_c_api_tests')
return logging.getLogger("foundationdb.run_c_api_tests")
def initialize_logger_level(logging_level):
logger = get_logger()
assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']
assert logging_level in ["DEBUG", "INFO", "WARNING", "ERROR"]
logging.basicConfig(format='%(message)s')
if logging_level == 'DEBUG':
logging.basicConfig(format="%(message)s")
if logging_level == "DEBUG":
logger.setLevel(logging.DEBUG)
elif logging_level == 'INFO':
elif logging_level == "INFO":
logger.setLevel(logging.INFO)
elif logging_level == 'WARNING':
elif logging_level == "WARNING":
logger.setLevel(logging.WARNING)
elif logging_level == 'ERROR':
elif logging_level == "ERROR":
logger.setLevel(logging.ERROR)
@@ -65,39 +73,52 @@ def dump_client_logs(log_dir):
print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file))
def run_tester(args, test_file):
cmd = [args.tester_binary,
"--cluster-file", args.cluster_file,
"--test-file", test_file,
"--stats-interval", str(TESTER_STATS_INTERVAL_SEC*1000)]
def run_tester(args, cluster, test_file):
build_dir = Path(args.build_dir).resolve()
tester_binary = Path(args.api_tester_bin).resolve()
external_client_library = build_dir.joinpath("bindings", "c", "libfdb_c_external.so")
log_dir = Path(cluster.log).joinpath("client")
log_dir.mkdir(exist_ok=True)
cmd = [
tester_binary,
"--cluster-file",
cluster.cluster_file,
"--test-file",
test_file,
"--stats-interval",
str(TESTER_STATS_INTERVAL_SEC * 1000),
"--tmp-dir",
cluster.tmp_dir,
"--log",
"--log-dir",
str(log_dir),
]
if args.external_client_library is not None:
cmd += ["--external-client-library", args.external_client_library]
if args.tmp_dir is not None:
cmd += ["--tmp-dir", args.tmp_dir]
log_dir = None
if args.log_dir is not None:
log_dir = Path(args.log_dir).joinpath(random_string(8))
log_dir.mkdir(exist_ok=True)
cmd += ['--log', "--log-dir", str(log_dir)]
external_client_library = Path(args.external_client_library).resolve()
cmd += ["--external-client-library", external_client_library]
if args.blob_granule_local_file_path is not None:
cmd += ["--blob-granule-local-file-path",
args.blob_granule_local_file_path]
if cluster.blob_granules_enabled:
cmd += [
"--blob-granule-local-file-path",
str(cluster.data.joinpath("fdbblob")) + os.sep,
]
if args.tls_ca_file is not None:
cmd += ["--tls-ca-file", args.tls_ca_file]
if args.tls_key_file is not None:
cmd += ["--tls-key-file", args.tls_key_file]
if args.tls_cert_file is not None:
cmd += ["--tls-cert-file", args.tls_cert_file]
if cluster.tls_config is not None:
cmd += [
"--tls-ca-file",
cluster.server_ca_file,
"--tls-key-file",
cluster.client_key_file,
"--tls-cert-file",
cluster.client_cert_file,
]
for knob in args.knobs:
knob_name, knob_value = knob.split("=")
cmd += ["--knob-" + knob_name, knob_value]
get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd))
get_logger().info("\nRunning tester '%s'..." % " ".join(map(str, cmd)))
proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
timed_out = False
ret_code = 1
@@ -107,34 +128,76 @@ def run_tester(args, test_file):
proc.kill()
timed_out = True
except Exception as e:
raise Exception('Unable to run tester (%s)' % e)
raise Exception("Unable to run tester (%s)" % e)
if ret_code != 0:
if timed_out:
reason = 'timed out after %d seconds' % args.timeout
reason = "timed out after %d seconds" % args.timeout
elif ret_code < 0:
reason = signal.Signals(-ret_code).name
else:
reason = 'exit code: %d' % ret_code
get_logger().error('\n\'%s\' did not complete successfully (%s)' %
(cmd[0], reason))
if (log_dir is not None):
reason = "exit code: %d" % ret_code
get_logger().error("\n'%s' did not complete succesfully (%s)" % (cmd[0], reason))
if log_dir is not None:
dump_client_logs(log_dir)
get_logger().info('')
get_logger().info("")
return ret_code
class TestConfig:
def __init__(self, test_file):
config = toml.load(test_file)
server_config = config.get("server", [{}])[0]
self.tenants_enabled = server_config.get("tenants_enabled", True)
self.blob_granules_enabled = server_config.get("blob_granules_enabled", False)
self.tls_enabled = server_config.get("tls_enabled", False)
self.client_chain_len = server_config.get("tls_client_chain_len", 2)
self.server_chain_len = server_config.get("tls_server_chain_len", 3)
self.min_num_processes = server_config.get("min_num_processes", 1)
self.max_num_processes = server_config.get("max_num_processes", 3)
self.num_processes = random.randint(self.min_num_processes, self.max_num_processes)
def run_test(args, test_file):
config = TestConfig(test_file)
tls_config = None
if config.tls_enabled:
tls_config = TLSConfig(
server_chain_len=config.server_chain_len,
client_chain_len=config.client_chain_len,
)
with TempCluster(
args.build_dir,
config.num_processes,
enable_tenants=config.tenants_enabled,
blob_granules_enabled=config.blob_granules_enabled,
tls_config=tls_config,
) as cluster:
ret_code = run_tester(args, cluster, test_file)
if not cluster.check_cluster_logs():
ret_code = 1 if ret_code == 0 else ret_code
return ret_code
def run_tests(args):
num_failed = 0
test_files = [f for f in os.listdir(args.test_dir) if os.path.isfile(
os.path.join(args.test_dir, f)) and f.endswith(".toml")]
if args.test_file is not None:
test_files = [Path(args.test_file).resolve()]
else:
test_files = [
f
for f in os.listdir(args.test_dir)
if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml")
]
for test_file in test_files:
get_logger().info('=========================================================')
get_logger().info('Running test %s' % test_file)
get_logger().info('=========================================================')
ret_code = run_tester(args, os.path.join(args.test_dir, test_file))
get_logger().info("=========================================================")
get_logger().info("Running test %s" % test_file)
get_logger().info("=========================================================")
ret_code = run_test(args, os.path.join(args.test_dir, test_file))
if ret_code != 0:
num_failed += 1
@@ -142,34 +205,49 @@ def run_tests(args):
def parse_args(argv):
parser = argparse.ArgumentParser(description='FoundationDB C API Tester')
parser.add_argument('--cluster-file', type=str, default="fdb.cluster",
help='The cluster file for the cluster being connected to. (default: fdb.cluster)')
parser.add_argument('--tester-binary', type=str, default="fdb_c_api_tester",
help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)')
parser.add_argument('--external-client-library', type=str, default=None,
help='Path to the external client library. (default: None)')
parser.add_argument('--test-dir', type=str, default="./",
help='Path to a directory with test definitions. (default: ./)')
parser.add_argument('--timeout', type=int, default=300,
help='The timeout in seconds for running each individual test. (default 300)')
parser.add_argument('--log-dir', type=str, default=None,
help='The directory for storing logs (default: None)')
parser.add_argument('--logging-level', type=str, default='INFO',
choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').')
parser.add_argument('--tmp-dir', type=str, default=None,
help='The directory for storing temporary files (default: None)')
parser.add_argument('--blob-granule-local-file-path', type=str, default=None,
help='Enable blob granule tests if set, value is path to local blob granule files')
parser.add_argument('--tls-ca-file', type=str, default=None,
help='Path to client\'s TLS CA file: i.e. certificate of CA that signed the server certificate')
parser.add_argument('--tls-cert-file', type=str, default=None,
help='Path to client\'s TLS certificate file')
parser.add_argument('--tls-key-file', type=str, default=None,
help='Path to client\'s TLS private key file')
parser.add_argument('--knob', type=str, default=[], action="append", dest="knobs",
help='[lowercase-knob-name]=[knob-value] (there may be multiple --knob options)')
parser = argparse.ArgumentParser(description="FoundationDB C API Tester")
parser.add_argument("--build-dir", "-b", type=str, required=True, help="FDB build directory")
parser.add_argument("--api-tester-bin", type=str, help="Path to the fdb_c_api_tester executable.", required=True)
parser.add_argument("--external-client-library", type=str, help="Path to the external client library.")
parser.add_argument(
"--cluster-file",
type=str,
default="fdb.cluster",
help="The cluster file for the cluster being connected to. (default: fdb.cluster)",
)
parser.add_argument(
"--test-dir",
type=str,
default="./",
help="Path to a directory with test definitions. (default: ./)",
)
parser.add_argument(
"--test-file",
type=str,
default=None,
help="Path to a single test definition to be executed, overrides --test-dir if set.",
)
parser.add_argument(
"--timeout",
type=int,
default=300,
help="The timeout in seconds for running each individual test. (default 300)",
)
parser.add_argument(
"--logging-level",
type=str,
default="INFO",
choices=["ERROR", "WARNING", "INFO", "DEBUG"],
help="Specifies the level of detail in the tester output (default='INFO').",
)
parser.add_argument(
"--knob",
type=str,
default=[],
action="append",
dest="knobs",
help="[lowercase-knob-name]=[knob-value] (there may be multiple --knob options)",
)
return parser.parse_args(argv)
@@ -180,5 +258,5 @@ def main(argv):
return run_tests(args)
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))


@@ -12,13 +12,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -11,13 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100


@@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -11,13 +11,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -0,0 +1,28 @@
[[test]]
title = 'Cancel Transaction with Database per Transaction with TLS'
multiThreaded = true
buggify = true
databasePerTransaction = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -11,15 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000


@@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -0,0 +1,29 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -11,23 +11,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -4,23 +4,23 @@ minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -0,0 +1,37 @@
[[test]]
title = 'API Correctness with TLS'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10


@@ -11,23 +11,22 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000


@@ -9,13 +9,13 @@ maxClients = 8
minTenants = 2
maxTenants = 5
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9


@@ -0,0 +1,25 @@
[[test]]
title = 'Multi-tenant API Correctness Multi Threaded'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minClients = 2
maxClients = 8
minTenants = 2
maxTenants = 5
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9


@@ -12,13 +12,13 @@ maxClientThreads = 4
minClients = 2
maxClients = 4
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -0,0 +1,28 @@
[[test]]
title = 'Test tampering the cluster file with TLS'
multiThreaded = true
buggify = true
tamperClusterFile = true
minFdbThreads = 2
maxFdbThreads = 4
minDatabases = 2
maxDatabases = 4
minClientThreads = 2
maxClientThreads = 4
minClients = 2
maxClients = 4
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9


@@ -46,7 +46,7 @@ int main(int argc, char** argv) {
}
fdb_check(fdb_select_api_version(FDB_API_VERSION));
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
fdb_check(
fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));


@@ -321,7 +321,16 @@ int populate(Database db,
const auto key_begin = insertBegin(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
const auto key_end = insertEnd(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
auto key_checkpoint = key_begin; // in case of commit failure, restart from this key
double required_keys = (key_end - key_begin + 1) * args.load_factor;
for (auto i = key_begin; i <= key_end; i++) {
// Choose required_keys out of the remaining (key_end - i + 1) keys, i.e. select key i with
// probability required_keys / (key_end - i + 1): generate a random number in [0, 1) and keep
// the key if that number is at most the probability.
double r = rand() / (1.0 + RAND_MAX);
if (r > required_keys / (key_end - i + 1)) {
continue;
}
--required_keys;
/* sequential keys */
genKey(keystr.data(), KEY_PREFIX, args, i);
/* random values */
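
For context, the loop above is an instance of sequential selection sampling (Knuth's Algorithm S): scanning the keys in order and keeping each one with probability required_keys / remaining_keys yields an ordered, roughly uniform sample of about load_factor of the range. A minimal standalone sketch of the same scheme (the names and values here are illustrative, not mako's):

.. code-block:: cpp

    #include <cstdio>
    #include <cstdlib>

    int main() {
        const int key_begin = 0, key_end = 99999;
        const double load_factor = 0.25; // illustrative value for -l/--load_factor
        double required = (key_end - key_begin + 1) * load_factor;
        long chosen = 0;
        for (int i = key_begin; i <= key_end; i++) {
            double r = rand() / (1.0 + RAND_MAX); // uniform in [0, 1)
            if (r > required / (key_end - i + 1)) {
                continue; // skip this key
            }
            --required;
            ++chosen; // here mako would generate and insert key i
        }
        printf("chose %ld of %d keys\n", chosen, key_end - key_begin + 1);
        return 0;
    }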
@@ -984,6 +993,7 @@ int initArguments(Arguments& args) {
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
@@ -1166,6 +1176,7 @@ void usage() {
printf("%-24s %s\n", "-t, --threads=THREADS", "Specify number of worker threads");
printf("%-24s %s\n", " --async_xacts", "Specify number of concurrent transactions to be run in async mode");
printf("%-24s %s\n", "-r, --rows=ROWS", "Specify number of records");
printf("%-24s %s\n", "-l, --load_factor=LOAD_FACTOR", "Specify load factor");
printf("%-24s %s\n", "-s, --seconds=SECONDS", "Specify the test duration in seconds\n");
printf("%-24s %s\n", "", "This option cannot be specified with --iteration.");
printf("%-24s %s\n", "-i, --iteration=ITERS", "Specify the number of iterations.\n");
@@ -1228,6 +1239,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "threads", required_argument, NULL, 't' },
{ "async_xacts", required_argument, NULL, ARG_ASYNC },
{ "rows", required_argument, NULL, 'r' },
{ "load_factor", required_argument, NULL, 'l' },
{ "seconds", required_argument, NULL, 's' },
{ "iteration", required_argument, NULL, 'i' },
{ "keylen", required_argument, NULL, ARG_KEYLEN },
@@ -1304,6 +1316,9 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.rows = atoi(optarg);
args.row_digits = digits(args.rows);
break;
case 'l':
args.load_factor = atof(optarg);
break;
case 's':
args.seconds = atoi(optarg);
break;
@@ -1523,6 +1538,10 @@ int validateArguments(Arguments const& args) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
@@ -2118,6 +2137,7 @@ int statsProcessMain(Arguments const& args,
fmt::fprintf(fp, "\"async_xacts\": %d,", args.async_xacts);
fmt::fprintf(fp, "\"mode\": %d,", args.mode);
fmt::fprintf(fp, "\"rows\": %d,", args.rows);
fmt::fprintf(fp, "\"load_factor\": %lf,", args.load_factor);
fmt::fprintf(fp, "\"seconds\": %d,", args.seconds);
fmt::fprintf(fp, "\"iteration\": %d,", args.iteration);
fmt::fprintf(fp, "\"tpsmax\": %d,", args.tpsmax);


@@ -138,6 +138,7 @@ struct Arguments {
int async_xacts;
int mode;
int rows; /* is 2 billion enough? */
double load_factor;
int row_digits;
int seconds;
int iteration;


@@ -233,7 +233,7 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
// Try calling some basic functionality that is available
// in all recent API versions


@@ -271,7 +271,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
timeoutDb = fdb_open_database(argv[1]);


@@ -66,7 +66,7 @@ TEST_CASE("setup") {
},
&context));
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
CHECK(!context.called);
fdb_check(fdb_stop_network());


@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
set_net_opt(FDBNetworkOption::FDB_NET_OPTION_TRACE_PARTIAL_FILE_SUFFIX, trace_partial_file_suffix);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
// Apparently you need to open a database to initialize logging
FDBDatabase* out;


@@ -2998,7 +2998,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
clusterFilePath = std::string(argv[1]);


@@ -88,7 +88,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
{
FDBCluster* cluster;


@@ -392,11 +392,6 @@ func (o DatabaseOptions) SetTransactionIncludePortInAddress() error {
return o.setOpt(505, nil)
}
// Set a random idempotency id for all transactions. See the transaction option description for more information.
func (o DatabaseOptions) SetTransactionAutomaticIdempotency() error {
return o.setOpt(506, nil)
}
// Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information.
func (o DatabaseOptions) SetTransactionBypassUnreadable() error {
return o.setOpt(700, nil)
@@ -556,18 +551,6 @@ func (o TransactionOptions) SetSizeLimit(param int64) error {
return o.setOpt(503, int64ToBytes(param))
}
// Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes.
//
// Parameter: Unique ID
func (o TransactionOptions) SetIdempotencyId(param string) error {
return o.setOpt(504, []byte(param))
}
// Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future.
func (o TransactionOptions) SetAutomaticIdempotency() error {
return o.setOpt(505, nil)
}
// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior.
func (o TransactionOptions) SetSnapshotRywEnable() error {
return o.setOpt(600, nil)


@@ -320,11 +320,11 @@ function(create_long_running_correctness_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${package_files}
${out_dir}/joshua_test


@@ -0,0 +1,9 @@
#!/bin/sh
# Simulation currently has memory leaks. We need to investigate before we can enable leak detection in joshua.
export ASAN_OPTIONS="detect_leaks=0"
OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}"
#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false
python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --long-running


@@ -0,0 +1,3 @@
#!/bin/bash -u
python3 -m test_harness.timeout --long-running


@@ -184,6 +184,8 @@ class Config:
self.reproduce_prefix: str | None = None
self.reproduce_prefix_args = {'type': str, 'required': False,
'help': 'When printing the results, prepend this string to the command'}
self.long_running: bool = False
self.long_running_args = {'action': 'store_true'}
self._env_names: Dict[str, str] = {}
self._config_map = self._build_map()
self._read_env()

View File

@@ -303,6 +303,7 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@@ -375,7 +376,7 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)


@@ -384,6 +384,7 @@ class Summary:
child.attributes['Severity'] = '40'
child.attributes['ErrorCount'] = str(self.errors)
self.out.append(child)
self.error = True
if self.was_killed:
child = SummaryTree('ExternalTimeout')
child.attributes['Severity'] = '40'
@@ -420,6 +421,7 @@ class Summary:
child = SummaryTree('TestUnexpectedlyNotFinished')
child.attributes['Severity'] = '40'
self.out.append(child)
self.error = True
if self.error_out is not None and len(self.error_out) > 0:
lines = self.error_out.splitlines()
stderr_bytes = 0


@@ -524,6 +524,12 @@ The ``start`` command will start a new restore on the specified (or default) tag
``--inconsistent-snapshot-only``
Ignore mutation log files during the restore to speed up the process. Because only range files are restored, this option gives an inconsistent snapshot in most cases and is not recommended.
``--user-data``
Restore only the user keyspace. This option should NOT be used alongside --system-metadata (below) and CANNOT be used alongside other specified key ranges.
``--system-metadata``
Restore only the relevant system keyspace. This option should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.
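
For example, a divided restore could run the two options as separate jobs; this sketch assumes the usual ``-r`` source-URL and ``--dest-cluster-file`` options, with placeholder values:

.. code-block:: bash

    fdbrestore start -r <backup_url> --dest-cluster-file fdb.cluster --user-data -w
    fdbrestore start -r <backup_url> --dest-cluster-file fdb.cluster --system-metadata -w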
.. program:: fdbrestore abort
``abort``


@@ -648,6 +648,16 @@ The subclasses of the ``ApiWorkload`` inherit the following configuration option
initiated by a test script to check if the client workload is successfully progressing after a
cluster change.
The FDB server configuration can be specialized in the section ``[[server]]`` (an example follows the list):
- ``tenants_enabled``: enable multitenancy (default: true)
- ``blob_granules_enabled``: enable support for blob granules (default: false)
- ``tls_enabled``: enable TLS (default: false)
- ``tls_client_chain_len``: the length of the client-side TLS chain (default: 2)
- ``tls_server_chain_len``: the length of the server-side TLS chain (default: 3)
- ``min_num_processes`` and ``max_num_processes``: the number of FDB server processes to be
randomly selected from the given range (default 1-3)
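
For instance, a test that needs TLS on a single-process cluster with the default chain lengths could declare (a sketch assembled from the options above):

.. code-block:: toml

    [[server]]
    tls_enabled = true
    tls_client_chain_len = 2
    tls_server_chain_len = 3
    min_num_processes = 1
    max_num_processes = 1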
Executing the Tests
===================
@@ -656,19 +666,35 @@ according to its specification. Before that we must create an FDB cluster and pass its cluster file as
a parameter to ``fdb_c_api_tester``. Note that multithreaded tests also need to be provided with an
external client library.
For example, we can create a temporary cluster and use it for execution of one of the existing API tests:
The ``run_c_api_tests.py`` script automates execution of the API tests on a local cluster. The cluster
is created according to the options specified in the ``[[server]]`` section of the given test file.
.. code-block:: bash
${srcDir}/tests/TestRunner/tmp_cluster.py --build-dir ${buildDir} -- \
${buildDir}/bin/fdb_c_api_tester \
--cluster-file @CLUSTER_FILE@ \
--external-client-library=${buildDir}/bindings/c/libfdb_c_external.so \
${srcDir}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${buildDir}
--api-tester-bin ${buildDir}/bin/fdb_c_api_tester
--external-client-library ${buildDir}/bindings/c/libfdb_c_external.so
--test-file ${srcDir}/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
The test specifications added to the ``bindings/c/test/apitester/tests/`` directory are executed as a part
of the regression test suite. They can be executed using the ``ctest`` target ``fdb_c_api_tests``:
of the regression test suite as ``ctest`` targets with names ``fdb_c_api_test_{file_name}``.
The ``ctest`` targets provide a more convenient way to execute the API tests. We can execute
a single test:
.. code-block:: bash
ctest -R fdb_c_api_tests -VV
ctest -R fdb_c_api_test_CApiCorrectnessMultiThr -VV
or execute all of them in parallel (here ``-j20`` specifies the parallelization level):
.. code-block:: bash
ctest -R fdb_c_api_test_ -j20 --output-on-failure
More sophisticated filters can be applied to execute a selected set of tests, e.g. the tests using TLS:
.. code-block:: bash
ctest -R 'fdb_c_api_test_.*TLS' -j20 --output-on-failure


@@ -47,6 +47,7 @@
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/S3BlobStore.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/json_spirit/json_spirit_writer_template.h"
#include "flow/Platform.h"
@@ -155,6 +156,11 @@ enum {
OPT_RESTORE_CLUSTERFILE_ORIG,
OPT_RESTORE_BEGIN_VERSION,
OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY,
// The two restore options below allow callers of fdbrestore to divide a normal restore into one which restores just
// the system keyspace and another that restores just the user keyspace. This is unlike the backup command, where
// all keys (both system and user) are backed up together.
OPT_RESTORE_USER_DATA,
OPT_RESTORE_SYSTEM_DATA,
// Shared constants
OPT_CLUSTERFILE,
@@ -696,6 +702,8 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = {
{ OPT_BACKUPKEYS, "--keys", SO_REQ_SEP },
{ OPT_WAITFORDONE, "-w", SO_NONE },
{ OPT_WAITFORDONE, "--waitfordone", SO_NONE },
{ OPT_RESTORE_USER_DATA, "--user-data", SO_NONE },
{ OPT_RESTORE_SYSTEM_DATA, "--system-metadata", SO_NONE },
{ OPT_RESTORE_VERSION, "--version", SO_REQ_SEP },
{ OPT_RESTORE_VERSION, "-v", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
@@ -1187,6 +1195,13 @@ static void printRestoreUsage(bool devhelp) {
printf(" The cluster file for the original database from which the backup was created. The "
"original database\n");
printf(" is only needed to convert a --timestamp argument to a database version.\n");
printf(" --user-data\n"
" Restore only the user keyspace. This option should NOT be used alongside "
"--system-metadata (below) and CANNOT be used alongside other specified key ranges.\n");
printf(
" --system-metadata\n"
" Restore only the relevant system keyspace. This option "
"should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.\n");
if (devhelp) {
#ifdef _WIN32
@@ -3367,6 +3382,8 @@ int main(int argc, char* argv[]) {
bool trace = false;
bool quietDisplay = false;
bool dryRun = false;
bool restoreSystemKeys = false;
bool restoreUserKeys = false;
// TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start
bool encryptionEnabled = true;
std::string traceDir = "";
@@ -3691,6 +3708,14 @@ int main(int argc, char* argv[]) {
restoreVersion = ver;
break;
}
case OPT_RESTORE_USER_DATA: {
restoreUserKeys = true;
break;
}
case OPT_RESTORE_SYSTEM_DATA: {
restoreSystemKeys = true;
break;
}
case OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY: {
inconsistentSnapshotOnly.set(true);
break;
@@ -3838,6 +3863,11 @@ int main(int argc, char* argv[]) {
}
}
if (restoreSystemKeys && restoreUserKeys) {
fprintf(stderr, "ERROR: Please only specify one of --user-data or --system-metadata, not both\n");
return FDB_EXIT_ERROR;
}
if (trace) {
if (!traceLogGroup.empty())
setNetworkOption(FDBNetworkOptions::TRACE_LOG_GROUP, StringRef(traceLogGroup));
@@ -3938,10 +3968,30 @@ int main(int argc, char* argv[]) {
// The fastrestore tool does not yet support multiple ranges and is incompatible with tenants
// or other features that back up data in the system keys
if (backupKeys.empty() && programExe != ProgramExe::FASTRESTORE_TOOL) {
if (!restoreSystemKeys && !restoreUserKeys && backupKeys.empty() &&
programExe != ProgramExe::FASTRESTORE_TOOL) {
addDefaultBackupRanges(backupKeys);
}
if ((restoreSystemKeys || restoreUserKeys) && programExe == ProgramExe::FASTRESTORE_TOOL) {
fprintf(stderr, "ERROR: Options: --user-data and --system-metadata are not supported with fastrestore\n");
return FDB_EXIT_ERROR;
}
if ((restoreUserKeys || restoreSystemKeys) && !backupKeys.empty()) {
fprintf(stderr,
"ERROR: Cannot specify additional ranges when using --user-data or --system-metadata "
"options\n");
return FDB_EXIT_ERROR;
}
if (restoreUserKeys) {
backupKeys.push_back_deep(backupKeys.arena(), normalKeys);
} else if (restoreSystemKeys) {
for (const auto& r : getSystemBackupRanges()) {
backupKeys.push_back_deep(backupKeys.arena(), r);
}
}
switch (programExe) {
case ProgramExe::AGENT:
if (!initCluster())


@@ -93,8 +93,12 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
} else if (limitType == LimitType::RESERVED) {
quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();
}
ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota);
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully updated quota.\n");
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
@@ -109,6 +113,7 @@ ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
try {
tr->clear(ThrottleApi::getTagQuotaKey(tag));
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully cleared quota.\n");
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
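
As an aside, the quota conversion in ``setQuota`` above uses the standard integer ceiling-division idiom: for positive integers, ``(value - 1) / factor + 1`` equals ``ceil(value / factor)``, so any nonzero byte value maps to at least one cost unit. A quick self-contained check (the factor 16384 is only an assumed illustration, not necessarily the actual ``READ_COST_BYTE_FACTOR``):

.. code-block:: cpp

    #include <cassert>

    // ceil(value / factor) for positive integers, matching the idiom above
    long ceilDiv(long value, long factor) {
        return (value - 1) / factor + 1;
    }

    int main() {
        const long factor = 16384; // assumed illustrative READ_COST_BYTE_FACTOR
        assert(ceilDiv(1, factor) == 1);     // any nonzero value rounds up to 1
        assert(ceilDiv(16384, factor) == 1); // exact multiples are unchanged
        assert(ceilDiv(16385, factor) == 2); // one byte over rounds up
        return 0;
    }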


@@ -1480,6 +1480,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (isCommitDesc && tokens.size() == 1) {
// prompt for description and add to txn
state Optional<std::string> raw;
warn.cancel();
while (!raw.present() || raw.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty.\n");
@@ -1490,6 +1491,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
std::string line = raw.get();
config_tr->set("\xff\xff/description"_sr, line);
}
warn =
checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
if (transtype == TransType::Db) {
wait(commitTransaction(tr));
} else {
@@ -1821,6 +1824,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (!intrans) {
// prompt for description and add to txn
state Optional<std::string> raw_desc;
warn.cancel();
while (!raw_desc.present() || raw_desc.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty\n");
@ -1830,6 +1834,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
}
std::string line = raw_desc.get();
config_tr->set("\xff\xff/description"_sr, line);
warn = checkStatus(
timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
wait(commitTransaction(config_tr));
} else {
isCommitDesc = true;


@@ -109,7 +109,7 @@ def quota(logger):
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully cleared quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
@@ -120,17 +120,17 @@ def quota(logger):
command = 'quota set red total_throughput 49152'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully updated quota.'
command = 'quota set green total_throughput 32768'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully updated quota.'
command = 'quota set green reserved_throughput 16384'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully updated quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
@@ -145,7 +145,7 @@ def quota(logger):
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == ''
assert output == 'Successfully cleared quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)


@@ -63,7 +63,7 @@ public:
m_buffer = Standalone<VectorRef<uint8_t>>(old.slice(size, old.size()));
// Write the old buffer to the underlying file and update the write offset
Future<Void> r = holdWhile(old, m_file->write(old.begin(), size, m_writeOffset));
Future<Void> r = uncancellable(holdWhile(old, m_file->write(old.begin(), size, m_writeOffset)));
m_writeOffset += size;
return r;


@@ -1057,6 +1057,9 @@ ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beg
beginVersion <= delta.clearVersion.get();
if (delta.values.empty()) {
return ParsedDeltaBoundaryRef(delta.key, clearAfter);
} else if (readVersion >= delta.values.back().version && beginVersion <= delta.values.back().version) {
// For all but at most one delta file, readVersion is >= every version in the file, so the last value is the visible one; optimize that common case.
return ParsedDeltaBoundaryRef(delta.key, clearAfter, delta.values.back());
}
auto valueAtVersion = std::lower_bound(delta.values.begin(),
delta.values.end(),
@@ -1338,6 +1341,10 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
std::set<int16_t, std::greater<int16_t>> activeClears;
int16_t maxActiveClear = -1;
// trade off memory for cpu performance by assuming all inserts
RangeResult result;
int maxExpectedSize = 0;
// check if a given stream is actively clearing
bool clearActive[streams.size()];
for (int16_t i = 0; i < streams.size(); i++) {
@@ -1355,14 +1362,12 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
item.streamIdx = i;
item.dataIdx = 0;
next.push(item);
maxExpectedSize += streams[i].size();
result.arena().dependsOn(streams[i].arena());
}
}
result.reserve(result.arena(), maxExpectedSize);
if (chunk.snapshotFile.present()) {
stats.snapshotRows += streams[0].size();
}
RangeResult result;
std::vector<MergeStreamNext> cur;
cur.reserve(streams.size());
while (!next.empty()) {
@@ -1397,7 +1402,7 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
if (v.isSet() && maxActiveClear < it.streamIdx) {
KeyRef finalKey =
chunk.tenantPrefix.present() ? v.key.removePrefix(chunk.tenantPrefix.get()) : v.key;
result.push_back_deep(result.arena(), KeyValueRef(finalKey, v.value));
result.push_back(result.arena(), KeyValueRef(finalKey, v.value));
if (!includesSnapshot) {
stats.rowsInserted++;
} else if (it.streamIdx > 0) {
@@ -1426,11 +1431,39 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
}
}
// FIXME: if memory assumption was wrong and result is significantly smaller than total input size, could copy it
// with push_back_deep to a new result. This is rare though
stats.outputBytes += result.expectedSize();
return result;
}
RangeResult materializeJustSnapshot(const BlobGranuleChunkRef& chunk,
Optional<StringRef> snapshotData,
const KeyRange& requestRange,
GranuleMaterializeStats& stats) {
stats.inputBytes += snapshotData.get().size();
Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows = loadSnapshotFile(
chunk.snapshotFile.get().filename, snapshotData.get(), requestRange, chunk.snapshotFile.get().cipherKeysCtx);
RangeResult result;
if (!snapshotRows.empty()) {
result.arena().dependsOn(snapshotRows.arena());
result.reserve(result.arena(), snapshotRows.size());
for (auto& it : snapshotRows) {
// TODO REMOVE validation
ASSERT(it.op == MutationRef::Type::SetValue);
KeyRef finalKey = chunk.tenantPrefix.present() ? it.key.removePrefix(chunk.tenantPrefix.get()) : it.key;
result.push_back(result.arena(), KeyValueRef(finalKey, it.value));
}
stats.outputBytes += result.expectedSize();
stats.snapshotRows += result.size();
}
return result;
}
RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
KeyRangeRef keyRange,
Version beginVersion,
@@ -1454,6 +1487,11 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
requestRange = keyRange;
}
// fast case for only-snapshot read
if (chunk.snapshotFile.present() && chunk.deltaFiles.empty() && chunk.newDeltas.empty()) {
return materializeJustSnapshot(chunk, snapshotData, requestRange, stats);
}
std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>> streams;
std::vector<bool> startClears;
// +1 for possible snapshot, +1 for possible memory deltas
@ -1471,7 +1509,10 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
streams.push_back(snapshotRows);
startClears.push_back(false);
arena.dependsOn(streams.back().arena());
stats.snapshotRows += snapshotRows.size();
}
} else {
ASSERT(!chunk.snapshotFile.present());
}
if (BG_READ_DEBUG) {
@@ -2675,6 +2716,14 @@ struct CommonPrefixStats {
int totalKeys = 0;
int minKeySize = 1000000000;
int maxKeySize = 0;
int64_t logicalBytes = 0;
int64_t totalLogicalBytes = 0;
int deltas = 0;
int deltasSet = 0;
int deltasClear = 0;
int deltasNoOp = 0;
int deltasClearAfter = 0;
void addKey(const KeyRef& k) {
if (len == -1) {
@@ -2689,7 +2738,38 @@ struct CommonPrefixStats {
maxKeySize = std::max(maxKeySize, k.size());
}
void addKeyValue(const KeyRef& k, const ValueRef& v) {
addKey(k);
logicalBytes += k.size();
logicalBytes += v.size();
}
void addBoundary(const ParsedDeltaBoundaryRef& d) {
addKey(d.key);
deltas++;
if (d.isSet()) {
deltasSet++;
logicalBytes += d.value.size();
} else if (d.isClear()) {
deltasClear++;
} else {
ASSERT(d.isNoOp());
deltasNoOp++;
}
if (d.clearAfter) {
deltasClearAfter++;
}
}
void doneFile() {
totalLogicalBytes += logicalBytes;
fmt::print("Logical Size: {0}\n", logicalBytes);
logicalBytes = 0;
}
Key done() {
doneFile();
ASSERT(len >= 0);
fmt::print("Common prefix: {0}\nCommon Prefix Length: {1}\nAverage Key Size: {2}\nMin Key Size: {3}, Max Key "
"Size: {4}\n",
@ -2698,11 +2778,21 @@ struct CommonPrefixStats {
totalKeySize / totalKeys,
minKeySize,
maxKeySize);
if (deltas > 0) {
fmt::print("Delta stats: {0} deltas, {1} sets, {2} clears, {3} noops, {4} clearAfters\n",
deltas,
deltasSet,
deltasClear,
deltasNoOp,
deltasClearAfter);
}
fmt::print("Logical Size: {0}\n", totalLogicalBytes);
return key.substr(0, len);
}
};
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames) {
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames, bool newFormat) {
FileSet files;
CommonPrefixStats stats;
for (int i = 0; i < filenames.size(); i++) {
@ -2713,40 +2803,66 @@ FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filena
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
Arena arena;
GranuleSnapshot file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena);
Standalone<GranuleSnapshot> parsed(file, arena);
Standalone<GranuleSnapshot> parsed;
if (!newFormat) {
Arena arena;
GranuleSnapshot file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena);
parsed = Standalone<GranuleSnapshot>(file, arena);
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
for (auto& it : parsed) {
stats.addKeyValue(it.key, it.value);
}
} else {
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res = loadSnapshotFile(""_sr, data, normalKeys, {});
fmt::print("Loaded {0} rows from snapshot file\n", res.size());
for (auto& it : res) {
stats.addKeyValue(it.key, it.value);
}
}
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
files.snapshotFile = { filenames[i], version, data, parsed };
for (auto& it : parsed) {
stats.addKey(it.key);
}
} else {
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
Arena arena;
GranuleDeltas file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena);
Standalone<GranuleDeltas> parsed(file, arena);
if (!newFormat) {
Arena arena;
GranuleDeltas file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena);
Standalone<GranuleDeltas> parsed(file, arena);
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
}
}
}
} else {
bool startClear = false;
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res =
loadChunkedDeltaFile(""_sr, data, normalKeys, 0, version, {}, startClear);
ASSERT(!startClear);
Standalone<GranuleDeltas> parsed;
fmt::print("Loaded {0} boundaries from delta file\n", res.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : res) {
stats.addBoundary(it);
}
}
}
stats.doneFile();
}
files.commonPrefix = stats.done();
@ -2804,6 +2920,28 @@ std::pair<int64_t, double> doDeltaWriteBench(const Standalone<GranuleDeltas>& da
return { serializedBytes, elapsed };
}
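// Builds a BlobGranuleChunkRef over the first numDeltaFiles delta files of a file set,
// letting read benchmarks vary how many delta layers sit on top of the snapshot.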
void chunkFromFileSet(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef>& chunk,
StringRef* deltaPtrs,
Version readVersion,
Optional<BlobGranuleCipherKeysCtx> keys,
int numDeltaFiles) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < numDeltaFiles; i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
FileSet rewriteChunkedFileSet(const FileSet& fileSet,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
@ -2830,41 +2968,30 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
KeyRange readRange,
bool clearAllAtEnd,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
int numDeltaFiles,
bool printStats = false) {
Version readVersion = std::get<1>(fileSet.deltaFiles.back());
Standalone<BlobGranuleChunkRef> chunk;
GranuleMaterializeStats stats;
StringRef deltaPtrs[fileSet.deltaFiles.size()];
ASSERT(numDeltaFiles >= 0 && numDeltaFiles <= fileSet.deltaFiles.size());
StringRef deltaPtrs[numDeltaFiles];
MutationRef clearAllAtEndMutation;
if (clearAllAtEnd) {
clearAllAtEndMutation = MutationRef(MutationRef::Type::ClearRange, readRange.begin, readRange.end);
}
if (chunked) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < fileSet.deltaFiles.size(); i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunkFromFileSet(fileSet, chunk, deltaPtrs, readVersion, keys, numDeltaFiles);
if (clearAllAtEnd) {
readVersion++;
MutationsAndVersionRef lastDelta;
lastDelta.version = readVersion;
lastDelta.mutations.push_back(chunk.arena(), clearAllAtEndMutation);
chunk.includedVersion = readVersion;
chunk.newDeltas.push_back_deep(chunk.arena(), lastDelta);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
int64_t serializedBytes = 0;
@ -2897,15 +3024,16 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
elapsed /= READ_RUNS;
serializedBytes /= READ_RUNS;
// TODO REMOVE
fmt::print("Materialize stats:\n");
fmt::print(" Input bytes: {0}\n", stats.inputBytes);
fmt::print(" Output bytes: {0}\n", stats.outputBytes);
fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes);
fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows);
fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared);
fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted);
fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated);
if (printStats) {
fmt::print("Materialize stats:\n");
fmt::print(" Input bytes: {0}\n", stats.inputBytes / READ_RUNS);
fmt::print(" Output bytes: {0}\n", stats.outputBytes / READ_RUNS);
fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes);
fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows / READ_RUNS);
fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared / READ_RUNS);
fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted / READ_RUNS);
fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated / READ_RUNS);
}
return { serializedBytes, elapsed };
}
@ -2937,7 +3065,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
int64_t logicalSnapshotSize = 0;
int64_t logicalDeltaSize = 0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it);
FileSet fileSet = loadFileSet(basePath, it, false);
fileSets.push_back(fileSet);
logicalSnapshotSize += std::get<3>(fileSet.snapshotFile).expectedSize();
for (auto& deltaFile : fileSet.deltaFiles) {
@ -2968,7 +3096,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3024,9 +3152,16 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
std::vector<std::string> readRunNames = {};
std::vector<std::pair<int64_t, double>> readMetrics;
bool doEdgeCaseReadTests = true;
bool doEdgeCaseReadTests = false;
bool doVaryingDeltaTests = false;
std::vector<double> clearAllReadMetrics;
std::vector<double> readSingleKeyMetrics;
std::vector<std::vector<std::pair<int64_t, double>>> varyingDeltaMetrics;
size_t maxDeltaFiles = 100000;
for (auto& f : fileSets) {
maxDeltaFiles = std::min(maxDeltaFiles, f.deltaFiles.size());
}
for (bool chunk : chunkModes) {
for (bool encrypt : encryptionModes) {
@ -3049,7 +3184,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3062,6 +3197,10 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
double totalElapsed = 0.0;
double totalElapsedClearAll = 0.0;
double totalElapsedSingleKey = 0.0;
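// One (bytes, seconds) accumulator per delta-file count, so read throughput can be
// compared as delta files are layered onto the snapshot.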
std::vector<std::pair<int64_t, double>> varyingDeltas;
for (int i = 0; i <= maxDeltaFiles; i++) {
varyingDeltas.push_back({ 0, 0.0 });
}
for (auto& fileSet : fileSets) {
FileSet newFileSet;
if (!chunk) {
@ -3070,24 +3209,38 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
newFileSet = rewriteChunkedFileSet(fileSet, keys, compressionFilter);
}
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, compressionFilter);
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, newFileSet.deltaFiles.size());
totalBytesRead += res.first;
totalElapsed += res.second;
if (doEdgeCaseReadTests) {
totalElapsedClearAll +=
doReadBench(newFileSet, chunk, fileSet.range, true, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, fileSet.range, true, keys, newFileSet.deltaFiles.size())
.second;
Key k = std::get<3>(fileSet.snapshotFile).front().key;
KeyRange singleKeyRange(KeyRangeRef(k, keyAfter(k)));
totalElapsedSingleKey +=
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, newFileSet.deltaFiles.size())
.second;
}
if (doVaryingDeltaTests && chunk) {
for (int i = 0; i <= maxDeltaFiles; i++) {
auto r = doReadBench(newFileSet, chunk, fileSet.range, false, keys, i);
varyingDeltas[i].first += r.first;
varyingDeltas[i].second += r.second;
}
}
}
readMetrics.push_back({ totalBytesRead, totalElapsed });
if (doEdgeCaseReadTests) {
clearAllReadMetrics.push_back(totalElapsedClearAll);
readSingleKeyMetrics.push_back(totalElapsedSingleKey);
}
if (doVaryingDeltaTests) {
varyingDeltaMetrics.push_back(varyingDeltas);
}
}
}
}
@ -3121,6 +3274,25 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
}
}
if (doVaryingDeltaTests) {
ASSERT(readRunNames.size() == varyingDeltaMetrics.size());
fmt::print("\n\nVarying Deltas Read Results:\nDF#\t");
for (int i = 0; i <= maxDeltaFiles; i++) {
fmt::print("{0}\t", i);
}
fmt::print("\n");
for (int i = 0; i < readRunNames.size(); i++) {
fmt::print("{0}", readRunNames[i]);
for (auto& it : varyingDeltaMetrics[i]) {
double MBperCPUsec = (it.first / 1024.0 / 1024.0) / it.second;
fmt::print("\t{:.6}", MBperCPUsec);
}
fmt::print("\n");
}
}
fmt::print("\n\nCombined Results:\n");
ASSERT(readRunNames.size() == runNames.size() - 1);
for (int i = 0; i < readRunNames.size(); i++) {
@ -3137,3 +3309,22 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
return Void();
}
TEST_CASE("!/blobgranule/files/repeatFromFiles") {
std::string basePath = "SET_ME";
std::vector<std::vector<std::string>> fileSetNames = { { "SET_ME" } };
int64_t totalBytesRead = 0;
double totalElapsed = 0.0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it, true);
auto res = doReadBench(fileSet, true, fileSet.range, false, {}, fileSet.deltaFiles.size(), true);
totalBytesRead += res.first;
totalElapsed += res.second;
}
double MBperCPUsec = (totalBytesRead / 1024.0 / 1024.0) / totalElapsed;
fmt::print("Read Results: {:.6} MB/cpusec\n", MBperCPUsec);
return Void();
}
View File
@ -142,7 +142,6 @@ bool isRangeFullyCovered(KeyRange range, Standalone<VectorRef<BlobGranuleChunkRe
for (const BlobGranuleChunkRef& chunk : blobChunks) {
blobRanges.push_back(chunk.keyRange);
}
return range.isCovered(blobRanges);
}
@ -194,7 +193,7 @@ TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") {
testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks);
testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks);
testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks);
ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false);
ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks));
}
return Void();
}
View File
@ -1040,13 +1040,10 @@ private:
Key lastValue;
};
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Database> cx) {
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
// Read begin key, if this fails then block was invalid.
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
// Read kv pairs and end key
@ -1075,7 +1072,6 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1083,7 +1079,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
int len,
Optional<Database> cx) {
state Standalone<StringRef> buf = makeString(len);
int rLen = wait(file->read(mutateString(buf), len, offset));
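// holdWhile keeps `buf` referenced until the read future resolves, and uncancellable
// ensures an in-flight read into that buffer is not abandoned if the calling actor is cancelled.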
int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset))));
if (rLen != len)
throw restore_bad_read();
@ -1098,7 +1094,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
wait(decodeKVPairs(&reader, &results, false, cx));
decodeKVPairs(&reader, &results);
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1121,7 +1117,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
wait(decodeKVPairs(&reader, &results, true, cx));
decodeKVPairs(&reader, &results);
} else {
throw restore_unsupported_file_version();
}
View File
@ -2559,19 +2559,19 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
}
}
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) {
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantName);
tr.set(key, BinaryWriter::toValue<uint64_t>(quota, Unversioned()));
tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned()));
}
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName)));
if (!v.present()) {
return Optional<uint64_t>();
return Optional<int64_t>();
}
return BinaryReader::fromStringRef<uint64_t>(v.get(), Unversioned());
return BinaryReader::fromStringRef<int64_t>(v.get(), Unversioned());
}
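// A minimal usage sketch (hypothetical actor, not part of this change). The switch to int64_t
// matches the BinaryWriter encoding above and leaves room for a signed sentinel value, which
// uint64_t could not represent.
ACTOR Future<Void> setQuotaExample(Database db, TenantName tenant) {
	state Transaction tr(db);
	loop {
		try {
			setStorageQuota(tr, tenant, 5LL << 30); // 5 GiB quota
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}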
std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) {
View File
@ -297,7 +297,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false );
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -421,6 +422,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experimental purposes; never enable it in production.
// If enabled, all committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable the ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These knobs have opposing functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
// Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable
@ -788,7 +793,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );
View File
@ -62,8 +62,8 @@ struct BlobMetadataDetailsRef {
BlobMetadataDomainNameRef domainName,
Optional<StringRef> base,
VectorRef<StringRef> partitions,
int64_t refreshAt,
int64_t expireAt)
double refreshAt,
double expireAt)
: domainId(domainId), domainName(ar, domainName), partitions(ar, partitions), refreshAt(refreshAt),
expireAt(expireAt) {
if (base.present()) {
View File
@ -336,12 +336,13 @@ struct KeyRangeRef {
bool isCovered(std::vector<KeyRangeRef>& ranges) {
ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder()));
KeyRangeRef clone(begin, end);
for (auto r : ranges) {
if (begin < r.begin)
if (clone.begin < r.begin)
return false; // uncovered gap between clone.begin and r.begin
if (end <= r.end)
if (clone.end <= r.end)
return true; // range is fully covered
if (end > r.begin)
if (clone.end > r.begin)
// {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end}
clone = KeyRangeRef(r.end, clone.end);
}
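// Worked example (sketch): for sorted ranges ["a","c") and ["c","f"), a query of ["a","e")
// is covered: the first pass trims the clone to ["c","e"), the second finds clone.end <= r.end
// and returns true. A query of ["a","g") exhausts the list and falls through to false after the loop.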
@ -1402,6 +1403,25 @@ struct TenantMode {
serializer(ar, mode);
}
// This does not go back-and-forth cleanly with toString
// The '_experimental' suffix, if present, needs to be removed in order to be parsed.
static TenantMode fromString(std::string mode) {
if (mode.find("_experimental") != std::string::npos) {
mode.replace(mode.find("_experimental"), std::string::npos, "");
}
if (mode == "disabled") {
return TenantMode::DISABLED;
} else if (mode == "optional") {
return TenantMode::OPTIONAL_TENANT;
} else if (mode == "required") {
return TenantMode::REQUIRED;
} else {
TraceEvent(SevError, "UnknownTenantMode").detail("TenantMode", mode);
ASSERT(false);
throw internal_error();
}
}
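// Round-trip caveat sketch (assuming TenantMode equality compares the underlying mode):
// fromString strips the "_experimental" suffix that toString may append, e.g.
//   ASSERT(TenantMode::fromString("optional_experimental") == TenantMode::OPTIONAL_TENANT);
//   ASSERT(TenantMode::fromString("disabled") == TenantMode::DISABLED);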
std::string toString() const {
switch (mode) {
case DISABLED:
@ -1669,8 +1689,8 @@ struct Versionstamp {
template <class Ar>
void serialize(Ar& ar) {
uint64_t beVersion;
uint16_t beBatch;
int64_t beVersion;
int16_t beBatch;
if constexpr (!Ar::isDeserializing) {
beVersion = bigEndian64(version);
@ -1680,7 +1700,7 @@ struct Versionstamp {
serializer(ar, beVersion, beBatch);
if constexpr (Ar::isDeserializing) {
version = bigEndian64(version);
version = bigEndian64(beVersion);
batchNumber = bigEndian16(beBatch);
}
}
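// The fix above byte-swaps the received wire value (beVersion) rather than the still-uninitialized
// member; since the byte swap is its own inverse, bigEndian64(bigEndian64(v)) == v holds for the
// serialize/deserialize pair.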
View File
@ -104,6 +104,11 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getL
// Collect cached cipher keys.
for (auto& domain : domains) {
if (domain.first == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (domain.first == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
Reference<BlobCipherKey> cachedCipherKey = cipherKeyCache->getLatestCipherKey(domain.first /*domainId*/);
if (cachedCipherKey.isValid()) {
cipherKeys[domain.first] = cachedCipherKey;
@ -301,7 +306,7 @@ template <class T>
Future<TextAndHeaderCipherKeys> getLatestSystemEncryptCipherKeys(const Reference<AsyncVar<T> const>& db,
BlobCipherMetrics::UsageType usageType) {
return getLatestEncryptCipherKeysForDomain(
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME, usageType);
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME, usageType);
}
ACTOR template <class T>
View File
@ -164,8 +164,8 @@ bool schemaMatch(json_spirit::mValue const& schema,
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Set and get the storage quota per tenant
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota);
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
#include "flow/unactorcompiler.h"
#endif
View File
@ -237,8 +237,10 @@ public:
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED;
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant in the TenantCache is
// refreshed
int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed
// in the TenantCache
int TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL; // How often the storage quota allocated to each tenant is
// refreshed in the TenantCache
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -345,6 +347,8 @@ public:
int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD;
int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD;
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;
View File
@ -535,28 +535,33 @@ public:
}
void put(const TenantNameEntryPair& pair) {
TenantEntryCachePayload<T> payload = createPayloadFunc(pair.first, pair.second);
auto idItr = mapByTenantId.find(pair.second.id);
auto nameItr = mapByTenantName.find(pair.first);
const auto& [name, entry] = pair;
TenantEntryCachePayload<T> payload = createPayloadFunc(name, entry);
auto idItr = mapByTenantId.find(entry.id);
auto nameItr = mapByTenantName.find(name);
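// The cache is indexed by both id and name; capture any stale counterpart entries first so
// both maps can be purged and remain consistent before the new payload is inserted.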
Optional<TenantName> existingName;
Optional<int64_t> existingId;
if (nameItr != mapByTenantName.end()) {
existingId = nameItr->value.entry.id;
mapByTenantId.erase(nameItr->value.entry.id);
}
if (idItr != mapByTenantId.end()) {
existingName = idItr->value.name;
mapByTenantName.erase(idItr->value.name);
}
if (existingId.present()) {
mapByTenantId.erase(existingId.get());
}
if (existingName.present()) {
mapByTenantName.erase(existingName.get());
}
mapByTenantId[pair.second.id] = payload;
mapByTenantName[pair.first] = payload;
mapByTenantId[entry.id] = payload;
mapByTenantName[name] = payload;
TraceEvent("TenantEntryCachePut")
.detail("TenantName", pair.first)
.detail("TenantName", name)
.detail("TenantNameExisting", existingName)
.detail("TenantID", pair.second.id)
.detail("TenantID", entry.id)
.detail("TenantIDExisting", existingId)
.detail("TenantPrefix", pair.second.prefix);
@ -582,4 +587,4 @@ public:
};
#include "flow/unactorcompiler.h"
#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H
#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H
View File
@ -202,8 +202,9 @@ description is not currently required but encouraged.
description="Deprecated. Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect."
defaultFor="23"/>
<Option name="transaction_automatic_idempotency" code="506"
description="Set a random idempotency id for all transactions. See the transaction option description for more information."
defaultFor="505"/>
description="Set a random idempotency id for all transactions. See the transaction option description for more information. This feature is in development and not ready for general use."
defaultFor="505"
hidden="true"/>
<Option name="transaction_bypass_unreadable" code="700"
description="Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information."
defaultFor="1100"/>
@ -278,9 +279,11 @@ description is not currently required but encouraged.
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes." />
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use."
hidden="true" />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future." />
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use."
hidden="true" />
<Option name="snapshot_ryw_enable" code="600"
description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." />
<Option name="snapshot_ryw_disable" code="601"
View File
@ -48,15 +48,17 @@ public:
ACTOR static Future<Standalone<StringRef>> readBlock(AsyncFileEncrypted* self, uint32_t block) {
state Arena arena;
state unsigned char* encrypted = new (arena) unsigned char[FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE];
int bytes = wait(
self->file->read(encrypted, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block));
int bytes = wait(uncancellable(holdWhile(arena,
self->file->read(encrypted,
FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE,
FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block))));
StreamCipherKey const* cipherKey = StreamCipherKey::getGlobalCipherKey();
DecryptionStreamCipher decryptor(cipherKey, self->getIV(block));
auto decrypted = decryptor.decrypt(encrypted, bytes, arena);
return Standalone<StringRef>(decrypted, arena);
}
ACTOR static Future<int> read(AsyncFileEncrypted* self, void* data, int length, int64_t offset) {
ACTOR static Future<int> read(Reference<AsyncFileEncrypted> self, void* data, int length, int64_t offset) {
state const uint32_t firstBlock = offset / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE;
state const uint32_t lastBlock = (offset + length - 1) / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE;
state uint32_t block;
@ -70,7 +72,7 @@ public:
if (cachedBlock.present()) {
plaintext = cachedBlock.get();
} else {
wait(store(plaintext, readBlock(self, block)));
wait(store(plaintext, readBlock(self.getPtr(), block)));
self->readBuffers.insert(block, plaintext);
}
auto start = (block == firstBlock) ? plaintext.begin() + (offset % FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE)
@ -96,7 +98,7 @@ public:
return bytesRead;
}
ACTOR static Future<Void> write(AsyncFileEncrypted* self, void const* data, int length, int64_t offset) {
ACTOR static Future<Void> write(Reference<AsyncFileEncrypted> self, void const* data, int length, int64_t offset) {
ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY);
// All writes must append to the end of the file:
ASSERT_EQ(offset, self->currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE + self->offsetInBlock);
@ -122,7 +124,7 @@ public:
return Void();
}
ACTOR static Future<Void> sync(AsyncFileEncrypted* self) {
ACTOR static Future<Void> sync(Reference<AsyncFileEncrypted> self) {
ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY);
wait(self->writeLastBlockToFile());
wait(self->file->sync());
@ -135,7 +137,7 @@ public:
Arena arena;
auto zeroes = new (arena) unsigned char[length];
memset(zeroes, 0, length);
wait(self->write(zeroes, length, offset));
wait(uncancellable(holdWhile(arena, self->write(zeroes, length, offset))));
return Void();
}
};
@ -159,11 +161,11 @@ void AsyncFileEncrypted::delref() {
}
Future<int> AsyncFileEncrypted::read(void* data, int length, int64_t offset) {
return AsyncFileEncryptedImpl::read(this, data, length, offset);
return AsyncFileEncryptedImpl::read(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset);
}
Future<Void> AsyncFileEncrypted::write(void const* data, int length, int64_t offset) {
return AsyncFileEncryptedImpl::write(this, data, length, offset);
return AsyncFileEncryptedImpl::write(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset);
}
Future<Void> AsyncFileEncrypted::zeroRange(int64_t offset, int64_t length) {
@ -177,7 +179,7 @@ Future<Void> AsyncFileEncrypted::truncate(int64_t size) {
Future<Void> AsyncFileEncrypted::sync() {
ASSERT(mode == Mode::APPEND_ONLY);
return AsyncFileEncryptedImpl::sync(this);
return AsyncFileEncryptedImpl::sync(Reference<AsyncFileEncrypted>::addRef(this));
}
Future<Void> AsyncFileEncrypted::flush() {
@ -217,7 +219,11 @@ StreamCipher::IV AsyncFileEncrypted::getIV(uint32_t block) const {
}
Future<Void> AsyncFileEncrypted::writeLastBlockToFile() {
return file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE);
// The source buffer for the write is owned by *this so this must be kept alive by reference count until the write
// is finished.
return uncancellable(
holdWhile(Reference<AsyncFileEncrypted>::addRef(this),
file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE)));
}
size_t AsyncFileEncrypted::RandomCache::evict() {
View File
@ -71,8 +71,9 @@ public:
// Wait for diskDelay before submitting the I/O
// Template types are being provided explicitly because they can't be automatically deduced for some reason.
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<int>(Void)>, int>(
delay(diskDelay), [=](Void _) -> Future<int> { return file->read(data, length, offset); });
delay(diskDelay), [=, file = file](Void _) -> Future<int> { return file->read(data, length, offset); });
}
Future<Void> write(void const* data, int length, int64_t offset) override {
@ -111,12 +112,14 @@ public:
}
// Wait for diskDelay before submitting the I/O
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(delay(diskDelay), [=](Void _) -> Future<Void> {
if (pdata)
return holdWhile(arena, file->write(pdata, length, offset));
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=, file = file](Void _) -> Future<Void> {
if (pdata)
return holdWhile(arena, file->write(pdata, length, offset));
return file->write(data, length, offset);
});
return file->write(data, length, offset);
});
}
Future<Void> truncate(int64_t size) override {
@ -125,8 +128,9 @@ public:
return file->truncate(size);
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=](Void _) -> Future<Void> { return file->truncate(size); });
delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->truncate(size); });
}
Future<Void> sync() override {
@ -135,8 +139,9 @@ public:
return file->sync();
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=](Void _) -> Future<Void> { return file->sync(); });
delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->sync(); });
}
Future<int64_t> size() const override {
@ -145,8 +150,9 @@ public:
return file->size();
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<int64_t>(Void)>, int64_t>(
delay(diskDelay), [=](Void _) -> Future<int64_t> { return file->size(); });
delay(diskDelay), [=, file = file](Void _) -> Future<int64_t> { return file->size(); });
}
int64_t debugFD() const override { return file->debugFD(); }
View File
@ -46,12 +46,17 @@ ACTOR Future<Void> sendErrorOnProcess(ISimulator::ProcessInfo* process,
TaskPriority taskID);
ACTOR template <class T>
Future<T> sendErrorOnShutdown(Future<T> in) {
choose {
when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
throw io_error().asInjectedFault();
Future<T> sendErrorOnShutdown(Future<T> in, bool assertOnCancel = false) {
try {
choose {
when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
throw io_error().asInjectedFault();
}
when(T rep = wait(in)) { return rep; }
}
when(T rep = wait(in)) { return rep; }
} catch (Error& e) {
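// With assertOnCancel set, cancellation of the wrapped read/write is treated as a bug
// rather than silently rethrown, since such futures are expected to be kept alive
// (e.g. via uncancellable) by their callers.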
ASSERT(e.code() != error_code_actor_cancelled || !assertOnCancel);
throw;
}
}
@ -59,9 +64,12 @@ class AsyncFileDetachable final : public IAsyncFile, public ReferenceCounted<Asy
private:
Reference<IAsyncFile> file;
Future<Void> shutdown;
bool assertOnReadWriteCancel;
public:
explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file) { shutdown = doShutdown(this); }
explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file), assertOnReadWriteCancel(true) {
shutdown = doShutdown(this);
}
ACTOR Future<Void> doShutdown(AsyncFileDetachable* self) {
wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()));
@ -84,13 +92,13 @@ public:
Future<int> read(void* data, int length, int64_t offset) override {
if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
return io_error().asInjectedFault();
return sendErrorOnShutdown(file->read(data, length, offset));
return sendErrorOnShutdown(file->read(data, length, offset), assertOnReadWriteCancel);
}
Future<Void> write(void const* data, int length, int64_t offset) override {
if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
return io_error().asInjectedFault();
return sendErrorOnShutdown(file->write(data, length, offset));
return sendErrorOnShutdown(file->write(data, length, offset), assertOnReadWriteCancel);
}
Future<Void> truncate(int64_t size) override {
View File
@ -52,7 +52,7 @@ public:
state Reference<CacheBlock> block(new CacheBlock(length));
try {
int len = wait(f->m_f->read(block->data, length, offset));
int len = wait(uncancellable(holdWhile(block, f->m_f->read(block->data, length, offset))));
block->len = len;
} catch (Error& e) {
f->m_max_concurrent_reads.release(1);
View File
@ -32,14 +32,18 @@ public:
// For read() and write(), the data buffer must remain valid until the future is ready
Future<int> read(void* data, int length, int64_t offset) override {
return map(m_f->read(data, length, offset), [=](int r) {
updateChecksumHistory(false, offset, r, (uint8_t*)data);
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->read(data, length, offset), [self, data, offset](int r) {
self->updateChecksumHistory(false, offset, r, (uint8_t*)data);
return r;
});
}
Future<Void> readZeroCopy(void** data, int* length, int64_t offset) override {
return map(m_f->readZeroCopy(data, length, offset), [=](Void r) {
updateChecksumHistory(false, offset, *length, (uint8_t*)data);
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->readZeroCopy(data, length, offset), [self, data, length, offset](Void r) {
self->updateChecksumHistory(false, offset, *length, (uint8_t*)data);
return r;
});
}
@ -50,12 +54,14 @@ public:
}
Future<Void> truncate(int64_t size) override {
return map(m_f->truncate(size), [=](Void r) {
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->truncate(size), [self, size](Void r) {
// Truncate the page checksum history if it is in use
if ((size / checksumHistoryPageSize) < checksumHistory.size()) {
int oldCapacity = checksumHistory.capacity();
checksumHistory.resize(size / checksumHistoryPageSize);
checksumHistoryBudget.get() -= (checksumHistory.capacity() - oldCapacity);
if ((size / checksumHistoryPageSize) < self->checksumHistory.size()) {
int oldCapacity = self->checksumHistory.capacity();
self->checksumHistory.resize(size / checksumHistoryPageSize);
checksumHistoryBudget.get() -= (self->checksumHistory.capacity() - oldCapacity);
}
return r;
});
View File
@ -239,7 +239,7 @@ public:
// Sets endpoint to be a new local endpoint which delivers messages to the given receiver
void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
void addEndpoints(std::vector<std::pair<struct FlowReceiver*, TaskPriority>> const& streams);
void addEndpoints(std::vector<std::pair<class FlowReceiver*, TaskPriority>> const& streams);
// The given local endpoint no longer delivers messages to the given receiver or uses resources
void removeEndpoint(const Endpoint&, NetworkMessageReceiver*);
View File
@ -42,8 +42,6 @@ struct TenantInfo {
// Is set during deserialization. It will be set to true if the tenant
// name is set and the client is authorized to use this tenant.
bool tenantAuthorized = false;
// Number of storage bytes currently used by this tenant.
int64_t storageUsage = 0;
// Helper function for most endpoints that read/write data. This returns true iff
// the client is either a) a trusted peer or b) is accessing keyspace belonging to a tenant,
View File
@ -28,9 +28,14 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/networksender.actor.h"
struct FlowReceiver : public NetworkMessageReceiver {
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
class FlowReceiver : public NetworkMessageReceiver, public NonCopyable {
Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
protected:
FlowReceiver() : m_isLocalEndpoint(false), m_stream(false) {}
FlowReceiver(Endpoint const& remoteEndpoint, bool stream)
@ -46,8 +51,17 @@ struct FlowReceiver : public NetworkMessageReceiver {
}
}
bool isLocalEndpoint() { return m_isLocalEndpoint; }
bool isRemoteEndpoint() { return endpoint.isValid() && !m_isLocalEndpoint; }
public:
bool isLocalEndpoint() const { return m_isLocalEndpoint; }
bool isRemoteEndpoint() const { return endpoint.isValid() && !m_isLocalEndpoint; }
void setRemoteEndpoint(Endpoint const& remoteEndpoint, bool stream) {
ASSERT(!m_isLocalEndpoint);
ASSERT(!endpoint.isValid());
endpoint = remoteEndpoint;
m_stream = stream;
FlowTransport::transport().addPeerReference(endpoint, m_stream);
}
// If already a remote endpoint, returns that. Otherwise makes this
// a local endpoint and returns that.
@ -80,12 +94,6 @@ struct FlowReceiver : public NetworkMessageReceiver {
}
const Endpoint& getRawEndpoint() { return endpoint; }
private:
Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
};
template <class T>
@ -363,8 +371,9 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
this->sendError(message.getError());
} else {
if (message.get().asUnderlyingType().acknowledgeToken.present()) {
acknowledgements = AcknowledgementReceiver(
FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()));
acknowledgements.setRemoteEndpoint(
FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()),
false);
if (onConnect.isValid() && onConnect.canBeSet()) {
onConnect.send(Void());
}
View File
@ -1240,6 +1240,7 @@ public:
PromiseTask* task = self->taskQueue.getReadyTask();
self->taskQueue.popReadyTask();
self->execTask(*task);
delete task;
self->yielded = false;
}
}
@ -2261,7 +2262,7 @@ public:
}
// Implementation
struct PromiseTask final {
struct PromiseTask final : public FastAllocated<PromiseTask> {
Promise<Void> promise;
ProcessInfo* machine;
explicit PromiseTask(ProcessInfo* machine) : machine(machine) {}
View File
@ -3537,7 +3537,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
}
// skip the rest of the algorithm for the first blob manager
if (bmData->epoch == 1) {
if (bmData->epoch == 1 && !isFullRestoreMode()) {
bmData->doneRecovering.send(Void());
return Void();
}
View File
@ -26,6 +26,7 @@
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbserver/Knobs.h"
#include "flow/FastRef.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/BlobConnectionProvider.h"
@ -189,23 +190,6 @@ private:
static const int sMaxCount_{ 5 }; // max number of manifest files to keep
};
// Defines granule info needed by a full restore
struct BlobGranuleVersion {
// Two constructors required by VectorRef
BlobGranuleVersion() {}
BlobGranuleVersion(Arena& a, const BlobGranuleVersion& copyFrom)
: granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version),
sizeInBytes(copyFrom.sizeInBytes) {}
UID granuleID;
KeyRangeRef keyRange;
Version version;
int64_t sizeInBytes;
};
// Defines a vector for BlobGranuleVersion
typedef Standalone<VectorRef<BlobGranuleVersion>> BlobGranuleVersionVector;
// Defines the filename, version, and size of each granule file needed by a full restore
struct GranuleFileVersion {
Version version;
@ -226,16 +210,53 @@ public:
Value data = wait(readFromFile(self));
Standalone<BlobManifest> manifest = decode(data);
wait(writeSystemKeys(self, manifest.rows));
BlobGranuleVersionVector _ = wait(listGranules(self));
BlobGranuleRestoreVersionVector _ = wait(listGranules(self));
} catch (Error& e) {
dprint("WARNING: unexpected manifest loader error {}\n", e.what()); // skip error handling so far
}
return Void();
}
// Iterate active granules and return their version/sizes
ACTOR static Future<BlobGranuleRestoreVersionVector> listGranules(Reference<BlobManifestLoader> self) {
state Transaction tr(self->db_);
loop {
state BlobGranuleRestoreVersionVector results;
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
std::vector<KeyRangeRef> granules;
state int i = 0;
auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
for (i = 0; i < blobRanges.size() - 1; i++) {
Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
try {
Standalone<BlobGranuleRestoreVersion> granule = wait(getGranule(&tr, granuleRange));
results.push_back_deep(results.arena(), granule);
} catch (Error& e) {
if (e.code() == error_code_restore_missing_data) {
dprint("missing data for key range {} \n", granuleRange.toString());
TraceEvent("BlobRestoreMissingData").detail("KeyRange", granuleRange.toString());
} else {
throw;
}
}
}
return results;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Print out a summary for blob granules
ACTOR static Future<Void> print(Reference<BlobManifestLoader> self) {
state BlobGranuleVersionVector granules = wait(listGranules(self));
state BlobGranuleRestoreVersionVector granules = wait(listGranules(self));
for (auto granule : granules) {
wait(checkGranuleFiles(self, granule));
}
@ -285,41 +306,9 @@ private:
}
}
// Iterate active granules and return their version/sizes
ACTOR static Future<BlobGranuleVersionVector> listGranules(Reference<BlobManifestLoader> self) {
state Transaction tr(self->db_);
loop {
state BlobGranuleVersionVector results;
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
std::vector<KeyRangeRef> granules;
state int i = 0;
auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
for (i = 0; i < blobRanges.size() - 1; i++) {
Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
try {
Standalone<BlobGranuleVersion> granule = wait(getGranule(&tr, granuleRange));
results.push_back_deep(results.arena(), granule);
} catch (Error& e) {
dprint("missing data for key range {} \n", granuleRange.toString());
}
}
return results;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Find the newest granule for a key range. The newest granule has the max version and relevant files
ACTOR static Future<Standalone<BlobGranuleVersion>> getGranule(Transaction* tr, KeyRangeRef range) {
state Standalone<BlobGranuleVersion> granuleVersion;
ACTOR static Future<Standalone<BlobGranuleRestoreVersion>> getGranule(Transaction* tr, KeyRangeRef range) {
state Standalone<BlobGranuleRestoreVersion> granuleVersion;
KeyRange historyKeyRange = blobGranuleHistoryKeyRangeFor(range);
// reverse lookup so that the first row is the newest version
state RangeResult results =
@ -389,7 +378,7 @@ private:
}
// Read data from granules and print out summary
ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleVersion granule) {
ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleRestoreVersion granule) {
state KeyRangeRef range = granule.keyRange;
state Version readVersion = granule.version;
state Transaction tr(self->db_);
@ -441,3 +430,11 @@ ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProv
wait(BlobManifestLoader::print(loader));
return Void();
}
// API to list blob granules
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db,
Reference<BlobConnectionProvider> blobConn) {
Reference<BlobManifestLoader> loader = makeReference<BlobManifestLoader>(db, blobConn);
BlobGranuleRestoreVersionVector result = wait(BlobManifestLoader::listGranules(loader));
return result;
}
View File
@ -30,54 +30,312 @@
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/ServerDBInfo.actor.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "flow/actorcompiler.h" // has to be last include
#include "flow/network.h"
#include <algorithm>
#include <string>
#define ENABLE_DEBUG_MG true
template <typename... T>
static inline void dprint(fmt::format_string<T...> fmt, T&&... args) {
if (ENABLE_DEBUG_MG)
fmt::print(fmt, std::forward<T>(args)...);
}
// BlobMigrator manages data migration from blob storage to storage server. It implements a minimal set of
// StorageServerInterface APIs which are needed for DataDistributor to start data migration.
class BlobMigrator : public NonCopyable, public ReferenceCounted<BlobMigrator> {
public:
BlobMigrator(Reference<AsyncVar<ServerDBInfo> const> dbInfo, BlobMigratorInterface interf)
: blobMigratorInterf(interf), actors(false) {
if (!blobConn.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
blobConn = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
: interf_(interf), actors_(false) {
if (!blobConn_.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
blobConn_ = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
}
db = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
db_ = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
}
~BlobMigrator() {}
// Start migration
ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
self->actors.add(waitFailureServer(self->blobMigratorInterf.waitFailure.getFuture()));
if (!isFullRestoreMode()) {
return Void();
}
wait(delay(10)); // TODO: wait for a readiness signal from the blob manager instead of a fixed delay
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
self->blobGranules_ = granules;
wait(prepare(self, normalKeys));
wait(serverLoop(self));
return Void();
}
private:
// Prepare for data migration for given key range.
ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
// Register as a storage server so that DataDistributor can start data movement afterwards
std::pair<Version, Tag> verAndTag = wait(addStorageServer(self->db_, self->interf_.ssi));
dprint("Started storage server interface {} {}\n", verAndTag.first, verAndTag.second.toString());
// Reassign key ranges to the storage server
// It'll restart DataDistributor so that internal data structures like ShardTracker and
// ShardsAffectedByTeamFailure can be re-initialized. Ideally this would be done within
// DataDistributor itself, so that no restart would be needed.
state int oldMode = wait(setDDMode(self->db_, 0));
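// Data distribution is paused while ownership is rewritten below, then restored to its
// previous mode so DD re-reads the new shard assignment.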
wait(unassignServerKeys(self, keys));
wait(assignKeysToServer(self, keys, self->interf_.ssi.id()));
wait(success(setDDMode(self->db_, oldMode)));
return Void();
}
// Assign the given key range to the specified storage server
ACTOR static Future<Void> assignKeysToServer(Reference<BlobMigrator> self, KeyRangeRef keys, UID serverUID) {
state Transaction tr(self->db_);
loop {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->blobMigratorInterf.haltBlobMigrator.getFuture())) {
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->blobMigratorInterf.id()).detail("ReqID", req.requesterID);
break;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
state Value value = keyServersValue(std::vector<UID>({ serverUID }), std::vector<UID>(), UID(), UID());
wait(krmSetRange(&tr, keyServersPrefix, keys, value));
wait(krmSetRange(&tr, serverKeysPrefixFor(serverUID), keys, serverKeysTrue));
wait(tr.commit());
dprint("Assign {} to server {}\n", normalKeys.toString(), serverUID.toString());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Unassign given key range from its current storage servers
ACTOR static Future<Void> unassignServerKeys(Reference<BlobMigrator> self, KeyRangeRef keys) {
state Transaction tr(self->db_);
loop {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
state RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
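// For every storage server, check whether it currently owns any part of `keys`
// (serverKeysTrue) and, if so, mark the whole range unassigned.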
for (auto& server : serverList) {
state UID id = decodeServerListValue(server.value).id();
RangeResult ranges = wait(krmGetRanges(&tr, serverKeysPrefixFor(id), keys));
bool owning = false;
for (auto& r : ranges) {
if (r.value == serverKeysTrue) {
owning = true;
break;
}
}
if (owning) {
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
}
}
when(wait(self->actors.getResult())) {}
wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Main server loop
ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
self->actors_.add(handleRequest(self));
self->actors_.add(handleUnsupportedRequest(self));
loop {
try {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
break;
}
when(wait(self->actors_.getResult())) {}
}
} catch (Error& e) {
dprint("Unexpected serverLoop error {}\n", e.what());
throw;
}
}
return Void();
}
// Handle StorageServerInterface APIs
ACTOR static Future<Void> handleRequest(Reference<BlobMigrator> self) {
state StorageServerInterface ssi = self->interf_.ssi;
loop {
try {
choose {
when(GetShardStateRequest req = waitNext(ssi.getShardState.getFuture())) {
dprint("Handle GetShardStateRequest\n");
Version version = maxVersion(self);
GetShardStateReply rep(version, version);
req.reply.send(rep); // return empty shards
}
when(WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
// dprint("Handle WaitMetricsRequest\n");
self->actors_.add(processWaitMetricsRequest(self, req));
}
when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
dprint("Handle SplitMetrics {}\n", req.keys.toString());
SplitMetricsReply rep;
for (auto granule : self->blobGranules_) {
// TODO: Use granule boundary as split point. A better approach is to split by size
if (granule.keyRange.begin > req.keys.begin && granule.keyRange.end < req.keys.end)
rep.splits.push_back_deep(rep.splits.arena(), granule.keyRange.begin);
}
req.reply.send(rep);
}
when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
fmt::print("Handle GetStorageMetrics\n");
StorageMetrics metrics;
metrics.bytes = sizeInBytes(self);
GetStorageMetricsReply resp;
resp.load = metrics;
req.reply.send(resp);
}
when(ReplyPromise<KeyValueStoreType> reply = waitNext(ssi.getKeyValueStoreType.getFuture())) {
dprint("Handle KeyValueStoreType\n");
reply.send(KeyValueStoreType::MEMORY);
}
}
} catch (Error& e) {
dprint("Unexpected blob migrator request error {}\n", e.what());
throw;
}
}
}
// Handle StorageServerInterface APIs that are not supported. Simply log and return an error
ACTOR static Future<Void> handleUnsupportedRequest(Reference<BlobMigrator> self) {
state StorageServerInterface ssi = self->interf_.ssi;
loop {
try {
choose {
when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
dprint("Unsupported SplitRangeRequest\n");
req.reply.sendError(unsupported_operation());
}
when(StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
self->actors_.add(processStorageQueuingMetricsRequest(req));
}
when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
dprint("Unsupported ReadHotSubRange\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyValuesStreamRequest req = waitNext(ssi.getKeyValuesStream.getFuture())) {
dprint("Unsupported GetKeyValuesStreamRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyRequest req = waitNext(ssi.getKey.getFuture())) {
dprint("Unsupported GetKeyRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture())) {
/* dprint("Unsupported GetKeyValuesRequest {} - {} @ {}\n",
req.begin.getKey().printable(),
req.end.getKey().printable(),
req.version); */
req.reply.sendError(unsupported_operation());
}
when(GetValueRequest req = waitNext(ssi.getValue.getFuture())) {
dprint("Unsupported GetValueRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetCheckpointRequest req = waitNext(ssi.checkpoint.getFuture())) {
dprint("Unsupported GetCheckpoint \n");
req.reply.sendError(unsupported_operation());
}
when(FetchCheckpointRequest req = waitNext(ssi.fetchCheckpoint.getFuture())) {
dprint("Unsupported FetchCheckpointRequest\n");
req.reply.sendError(unsupported_operation());
}
when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
dprint("Unsupported UpdateCommitCostRequest\n");
req.reply.sendError(unsupported_operation());
}
when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
dprint("Unsupported FetchCheckpointKeyValuesRequest\n");
req.reply.sendError(unsupported_operation());
}
}
} catch (Error& e) {
dprint("Unexpected request handling error {}\n", e.what());
throw;
}
}
}
ACTOR static Future<Void> processWaitMetricsRequest(Reference<BlobMigrator> self, WaitMetricsRequest req) {
state WaitMetricsRequest waitMetricsRequest = req;
// FIXME: get rid of this delay. It is a temporary workaround to avoid starvation in the scheduling of DD
// processes
wait(delay(1));
StorageMetrics metrics;
metrics.bytes = sizeInBytes(self, waitMetricsRequest.keys);
waitMetricsRequest.reply.send(metrics);
return Void();
}
ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
dprint("Unsupported StorageQueuingMetricsRequest\n");
// FIXME: get rid of this delay. It is a temporary workaround to avoid starvation in the scheduling of DD
// processes
wait(delay(1));
req.reply.sendError(unsupported_operation());
return Void();
}
// Return total storage size in bytes for migration
static int64_t sizeInBytes(Reference<BlobMigrator> self) { return sizeInBytes(self, normalKeys); }
// Return storage size in bytes for given key range
static int64_t sizeInBytes(Reference<BlobMigrator> self, KeyRangeRef range) {
int64_t bytes = 0;
for (auto granule : self->blobGranules_) {
if (range.intersects(granule.keyRange))
bytes += granule.sizeInBytes;
}
return bytes;
}
// Return max version for all blob granules
static Version maxVersion(Reference<BlobMigrator> self) {
Version max = 0;
for (auto granule : self->blobGranules_) {
max = std::max(granule.version, max);
}
return max;
}
private:
Database db;
Reference<BlobConnectionProvider> blobConn;
BlobMigratorInterface blobMigratorInterf;
ActorCollection actors;
Database db_;
Reference<BlobConnectionProvider> blobConn_;
BlobGranuleRestoreVersionVector blobGranules_;
BlobMigratorInterface interf_;
ActorCollection actors_;
};
// Main entry point
ACTOR Future<Void> blobMigrator(BlobMigratorInterface ssi, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", ssi.id().toString());
ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", interf.id().toString());
try {
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, ssi);
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
wait(BlobMigrator::start(self));
} catch (Error& e) {
fmt::print("unexpected blob migrator error {}\n", e.what());
dprint("Unexpected blob migrator error {}\n", e.what());
TraceEvent("BlobMigratorError", interf.id()).error(e);
}
return Void();
}

@ -3961,7 +3961,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
}
if (createChangeFeed) {
if (createChangeFeed && !isFullRestoreMode()) {
// create new change feed for new version of granule
wait(updateChangeFeed(
&tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));

@ -2615,8 +2615,9 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
}
loop {
if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
state Future<Void> wfClient =
waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
loop {
choose {
when(wait(wfClient)) {

@ -687,6 +687,20 @@ struct DDQueue : public IDDRelocationQueue {
Reference<EventCacheHolder> movedKeyServersEventHolder;
int moveReusePhysicalShard;
int moveCreateNewPhysicalShard;
enum RetryFindDstReason {
None = 0,
RemoteBestTeamNotReady,
PrimaryNoHealthyTeam,
RemoteNoHealthyTeam,
RemoteTeamIsFull,
RemoteTeamIsNotHealthy,
NoAvailablePhysicalShard,
NumberOfTypes,
};
std::vector<int> retryFindDstReasonCount;
void startRelocation(int priority, int healthPriority) {
// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
// we must count it into unhealthyRelocations; because team removers relies on unhealthyRelocations to
@ -750,7 +764,9 @@ struct DDQueue : public IDDRelocationQueue {
output(output), input(input), getShardMetrics(getShardMetrics), getTopKMetrics(getTopKMetrics), lastInterval(0),
suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")) {}
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
moveCreateNewPhysicalShard(0), retryFindDstReasonCount(static_cast<int>(RetryFindDstReason::NumberOfTypes), 0) {
}
DDQueue() = default;
void validate() {
@ -1463,6 +1479,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
loop {
destOverloadedCount = 0;
stuckCount = 0;
state DDQueue::RetryFindDstReason retryFindDstReason = DDQueue::RetryFindDstReason::None;
// state int bestTeamStuckThreshold = 50;
loop {
state int tciIndex = 0;
@ -1489,10 +1506,13 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
.detail("TeamCollectionIndex", tciIndex)
.detail("RestoreDataMoveForDest",
describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest));
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
foundTeams = false;
break;
}
if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) {
retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
: DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
foundTeams = false;
break;
}
@ -1545,12 +1565,15 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// getting the destination team or we could miss failure notifications for the storage
// servers in the destination team
TraceEvent("BestTeamNotReady");
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
foundTeams = false;
break;
}
// If a DC has no healthy team, we stop checking the other DCs until
// the unhealthy DC is healthy again or is excluded.
if (!bestTeam.first.present()) {
retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
: DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
foundTeams = false;
break;
}
@ -1574,6 +1597,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
double minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
foundTeams = false;
break;
}
@ -1616,6 +1640,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
if (!bestTeams[1].first->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
}
}
@ -1676,6 +1701,19 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// When !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate;
// thus, update the physicalShardIDCandidate in the related data structures
ASSERT(physicalShardIDCandidate != UID().first());
if (self->physicalShardCollection->physicalShardExists(physicalShardIDCandidate)) {
self->moveReusePhysicalShard++;
} else {
self->moveCreateNewPhysicalShard++;
if (retryFindDstReason == DDQueue::RetryFindDstReason::None) {
// Creating a new physical shard when the retry reason is None can only happen when
// determinePhysicalShardIDGivenPrimaryTeam() finds that there is no available physical
// shard.
self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++;
} else {
self->retryFindDstReasonCount[retryFindDstReason]++;
}
}
rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
inFlightRange.value().dataMoveId = rd.dataMoveId;
@ -2472,6 +2510,30 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
.trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by
// DataDistributor::movingDataEventHolder. The track latest
// key we use here must match the key used in the holder.
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
TraceEvent("PhysicalShardMoveStats")
.detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
.detail("MoveReusePhysicalShard", self.moveReusePhysicalShard)
.detail("RemoteBestTeamNotReady",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteBestTeamNotReady])
.detail("PrimaryNoHealthyTeam",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam])
.detail("RemoteNoHealthyTeam",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteNoHealthyTeam])
.detail("RemoteTeamIsFull",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull])
.detail("RemoteTeamIsNotHealthy",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy])
.detail(
"NoAvailablePhysicalShard",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]);
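// Reset the counters after each report so this trace event carries per-interval deltas.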
self.moveCreateNewPhysicalShard = 0;
self.moveReusePhysicalShard = 0;
for (int i = 0; i < self.retryFindDstReasonCount.size(); ++i) {
self.retryFindDstReasonCount[i] = 0;
}
}
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(ddQueueFutures))) {}

@ -2081,6 +2081,10 @@ void PhysicalShardCollection::logPhysicalShardCollection() {
}
}
bool PhysicalShardCollection::physicalShardExists(uint64_t physicalShardID) {
return physicalShardInstances.find(physicalShardID) != physicalShardInstances.end();
}
// FIXME: complete this test with non-empty range
TEST_CASE("/DataDistributor/Tracker/FetchTopK") {
state DataDistributionTracker self;

@ -286,8 +286,6 @@ public:
PromiseStream<RelocateShard> relocationProducer, relocationConsumer;
Reference<PhysicalShardCollection> physicalShardCollection;
StorageQuotaInfo storageQuotaInfo;
Promise<Void> initialized;
std::unordered_map<AuditType, std::vector<std::shared_ptr<DDAudit>>> audits;
@ -542,27 +540,6 @@ public:
}
};
ACTOR Future<Void> storageQuotaTracker(Database cx, StorageQuotaInfo* storageQuotaInfo) {
loop {
state Transaction tr(cx);
loop {
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size());
for (auto const kv : currentQuotas) {
Key const key = kv.key.removePrefix(storageQuotaPrefix);
uint64_t const quota = BinaryReader::fromStringRef<uint64_t>(kv.value, Unversioned());
storageQuotaInfo->quotaMap[key] = quota;
}
wait(delay(5.0));
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
}
// Periodically check and log the physicalShard status; clean up empty physicalShards
ACTOR Future<Void> monitorPhysicalShardStatus(Reference<PhysicalShardCollection> self) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
@ -683,16 +660,15 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(storageQuotaTracker(cx, &self->storageQuotaInfo),
"StorageQuotaTracker",
self->ddId,
&normalDDQueueErrors()));
if (ddIsTenantAware) {
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorTenantMap(),
"DDTenantCacheMonitor",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageQuota(),
"StorageQuotaTracker",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(),
"StorageUsageTracker",
self->ddId,

@ -429,7 +429,7 @@ public:
waitfor.push_back(self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos));
self->writingPos += pageData.size();
return waitForAll(waitfor);
return waitForAllReadyThenThrow(waitfor);
}
// Write the given data (pageData) to the queue files of self, sync data to disk, and delete the memory (pageMem)
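The switch from waitForAll to waitForAllReadyThenThrow above is deliberate: waitForAll throws as soon as any input future fails, which would abandon sibling disk writes that may still reference caller-owned buffers. A minimal sketch of the intended semantics, assuming Flow's ACTOR syntax and the existing waitForAllReady helper (illustrative, not necessarily the library's exact implementation):

ACTOR template <class T>
Future<Void> waitForAllReadyThenThrowSketch(std::vector<Future<T>> futures) {
	// First wait until every future is ready, success or error, so no I/O is left outstanding...
	wait(waitForAllReady(futures));
	// ...then surface the first error, if any.
	for (auto& f : futures) {
		if (f.isError()) {
			throw f.getError();
		}
	}
	return Void();
}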
@ -655,7 +655,7 @@ public:
for (int i = 0; i < 2; i++)
if (self->files[i].size > 0)
reads.push_back(self->files[i].f->read(self->firstPages[i], sizeof(Page), 0));
wait(waitForAll(reads));
wait(waitForAllReadyThenThrow(reads));
// Determine which file comes first
if (compare(self->firstPages[1], self->firstPages[0])) {
@ -743,7 +743,10 @@ public:
}
// Read nPages from pageOffset*sizeof(Page) offset in file self->files[file]
ACTOR static Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self, int file, int pageOffset, int nPages) {
ACTOR static UNCANCELLABLE Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self,
int file,
int pageOffset,
int nPages) {
state TrackMe trackMe(self);
state const size_t bytesRequested = nPages * sizeof(Page);
state Standalone<StringRef> result = makeAlignedString(sizeof(Page), bytesRequested);

@ -388,6 +388,15 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData
try {
KmsConnLookupEKsByKeyIdsReq keysByIdsReq;
for (const auto& item : lookupCipherInfoMap) {
// TODO: Currently getEncryptCipherKeys does not pass the domain name; once that is fixed, we can remove
// the check for an empty domain name
if (!item.second.domainName.empty()) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
}
keysByIdsReq.encryptKeyInfos.emplace_back_deep(
keysByIdsReq.arena, item.second.domainId, item.second.baseCipherId, item.second.domainName);
}
@ -527,6 +536,11 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
try {
KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq;
for (const auto& item : lookupCipherDomains) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
keysByDomainIdReq.encryptDomainInfos.emplace_back_deep(
keysByDomainIdReq.arena, item.second.domainId, item.second.domainName);
}

@ -53,7 +53,11 @@ struct KeyValueStoreCompressTestData final : IKeyValueStore {
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override {
store->set(KeyValueRef(keyValue.key, pack(keyValue.value)), arena);
}
void clear(KeyRangeRef range, const Arena* arena = nullptr) override { store->clear(range, arena); }
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override {
store->clear(range, storageMetrics, arena);
}
Future<Void> commit(bool sequential = false) override { return store->commit(sequential); }
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override {

@ -130,7 +130,7 @@ public:
}
}
void clear(KeyRangeRef range, const Arena* arena) override {
void clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) override {
// A commit that occurs with no available space returns Never, so we can throw out all modifications
if (getAvailableSize() <= 0)
return;

@ -1846,22 +1846,52 @@ struct RocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
writeBatch->Put(defaultFdbCF, toSlice(kv.key), toSlice(kv.value));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE) {
keysSet.insert(kv.key);
}
}
void clear(KeyRangeRef keyRange, const Arena*) override {
void clear(KeyRangeRef keyRange, const StorageServerMetrics* storageMetrics, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
if (keyRange.singleKeyRange()) {
writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin));
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr &&
storageMetrics->byteSample.getEstimate(keyRange) <
SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) {
rocksdb::ReadOptions options = sharedState->getReadOptions();
auto beginSlice = toSlice(keyRange.begin);
auto endSlice = toSlice(keyRange.end);
options.iterate_lower_bound = &beginSlice;
options.iterate_upper_bound = &endSlice;
auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options, defaultFdbCF));
cursor->Seek(toSlice(keyRange.begin));
while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) {
writeBatch->Delete(defaultFdbCF, cursor->key());
cursor->Next();
}
if (!cursor->status().ok()) {
// If the read-range iteration fails, fall back to a DeleteRange.
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
} else {
auto it = keysSet.lower_bound(keyRange.begin);
while (it != keysSet.end() && *it < keyRange.end) {
writeBatch->Delete(defaultFdbCF, toSlice(*it));
it++;
}
}
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
}
}
}
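A condensed, standalone sketch of the small-range strategy above, assuming a plain rocksdb::DB* on the default column family; kSmallRangeBytes stands in for ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, and the caller-supplied estimate stands in for the byte-sample lookup (the in-tree version additionally deletes keys buffered in the current write batch via keysSet):

#include <memory>
#include <rocksdb/db.h>
#include <rocksdb/write_batch.h>

// Convert a range clear into point deletes when the range is believed to be small,
// falling back to a range tombstone if iteration fails.
void clearRangeSketch(rocksdb::DB* db,
                      rocksdb::WriteBatch* batch,
                      const rocksdb::Slice& begin,
                      const rocksdb::Slice& end,
                      int64_t estimatedBytes) {
	const int64_t kSmallRangeBytes = 200000; // illustrative threshold
	if (estimatedBytes >= kSmallRangeBytes) {
		batch->DeleteRange(begin, end);
		return;
	}
	rocksdb::ReadOptions options;
	options.iterate_upper_bound = &end; // bound the scan to the cleared range
	std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options));
	for (it->Seek(begin); it->Valid(); it->Next()) {
		batch->Delete(it->key()); // tombstone each existing key individually
	}
	if (!it->status().ok()) {
		batch->DeleteRange(begin, end); // iteration failed; use a range tombstone instead
	}
}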
@ -1890,6 +1920,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
auto a = new Writer::CommitAction();
a->batchToCommit = std::move(writeBatch);
keysSet.clear();
auto res = a->done.getFuture();
writeThread->post(a);
return res;
@ -2083,6 +2114,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Promise<Void> closePromise;
Future<Void> openFuture;
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::set<Key> keysSet;
Optional<Future<Void>> metrics;
FlowLock readSemaphore;
int numReadWaiters;

@ -1596,7 +1596,9 @@ public:
StorageBytes getStorageBytes() const override;
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override;
Future<Void> commit(bool sequential = false) override;
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override;
@ -2215,7 +2217,7 @@ void KeyValueStoreSQLite::set(KeyValueRef keyValue, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::SetAction(keyValue));
}
void KeyValueStoreSQLite::clear(KeyRangeRef range, const Arena* arena) {
void KeyValueStoreSQLite::clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::ClearAction(range));
}

@ -49,6 +49,7 @@ static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 :
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
const std::string rocksDataFolderSuffix = "-data";
const std::string METADATA_SHARD_ID = "kvs-metadata";
const KeyRef shardMappingPrefix("\xff\xff/ShardMapping/"_sr);
// TODO: move constants to a header file.
const StringRef ROCKSDBSTORAGE_HISTOGRAM_GROUP = "RocksDBStorage"_sr;
@ -304,13 +305,12 @@ rocksdb::ReadOptions getReadOptions() {
}
struct ReadIterator {
rocksdb::ColumnFamilyHandle* cf;
uint64_t index; // incrementing counter to uniquely identify read iterator.
bool inUse;
std::shared_ptr<rocksdb::Iterator> iter;
double creationTime;
ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, rocksdb::ReadOptions& options)
: cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
: index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
};
/*
@ -475,13 +475,26 @@ struct PhysicalShard {
}
~PhysicalShard() {
if (!deletePending)
return;
logShardEvent(id, ShardOp::CLOSE);
isInitialized.store(false);
readIterPool.reset();
// Destroy CF
auto s = db->DropColumnFamily(cf);
// Deleting default column family is not allowed.
if (id == "default") {
return;
}
if (deletePending) {
auto s = db->DropColumnFamily(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
}
auto s = db->DestroyColumnFamilyHandle(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logRocksDBError(s, "DestroyCFHandle");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
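Note the ordering the destructor now enforces: RocksDB does not allow dropping the default column family (hence the early return), DropColumnFamily only marks the family's data for deletion, and the handle must still be released with DestroyColumnFamilyHandle afterward, whether or not the data was dropped.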
@ -628,7 +641,7 @@ public:
std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
bool foundMetadata = false;
for (const auto& name : columnFamilies) {
if (name == "kvs-metadata") {
if (name == METADATA_SHARD_ID) {
foundMetadata = true;
}
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions });
@ -652,19 +665,19 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
.detail("PhysicalShardCount", handles.size());
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
for (auto handle : handles) {
if (handle->GetName() == "kvs-metadata") {
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
} else {
physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
auto shard = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
if (shard->id == METADATA_SHARD_ID) {
metadataShard = shard;
}
physicalShards[shard->id] = shard;
columnFamilyMap[handle->GetID()] = handle;
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
.detail("PhysicalShard", handle->GetName());
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId).detail("PhysicalShard", shard->id);
}
std::set<std::string> unusedShards(columnFamilies.begin(), columnFamilies.end());
unusedShards.erase("kvs-metadata");
unusedShards.erase(METADATA_SHARD_ID);
unusedShards.erase("default");
KeyRange keyRange = prefixRange(shardMappingPrefix);
@ -746,9 +759,11 @@ public:
defaultShard->dataShards[specialKeys.begin.toString()] = std::move(dataShard);
physicalShards[defaultShard->id] = defaultShard;
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata");
// Create metadata shard.
auto metadataShard = std::make_shared<PhysicalShard>(db, METADATA_SHARD_ID);
metadataShard->init();
columnFamilyMap[metadataShard->cf->GetID()] = metadataShard->cf;
physicalShards[METADATA_SHARD_ID] = metadataShard;
// Write special key range metadata.
writeBatch = std::make_unique<rocksdb::WriteBatch>();
@ -763,7 +778,6 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
writeBatch = std::make_unique<rocksdb::WriteBatch>();
dirtyShards = std::make_unique<std::set<PhysicalShard*>>();
@ -910,6 +924,9 @@ public:
std::vector<std::shared_ptr<PhysicalShard>> getPendingDeletionShards(double cleanUpDelay) {
std::vector<std::shared_ptr<PhysicalShard>> emptyShards;
double currentTime = now();
TraceEvent(SevInfo, "ShardedRocksDB", logId)
.detail("PendingDeletionShardQueueSize", pendingDeletionShards.size());
while (!pendingDeletionShards.empty()) {
const auto& id = pendingDeletionShards.front();
auto it = physicalShards.find(id);
@ -976,6 +993,10 @@ public:
.detail("Info", "RangeToPersist")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
auto it = physicalShards.find(METADATA_SHARD_ID);
ASSERT(it != physicalShards.end());
auto metadataShard = it->second;
writeBatch->DeleteRange(metadataShard->cf,
getShardMappingKey(range.begin, shardMappingPrefix),
getShardMappingKey(range.end, shardMappingPrefix));
@ -1043,24 +1064,30 @@ public:
}
void closeAllShards() {
for (auto& [_, shard] : physicalShards) {
shard->readIterPool.reset();
}
columnFamilyMap.clear();
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBClosed");
}
void destroyAllShards() {
closeAllShards();
std::vector<rocksdb::ColumnFamilyDescriptor> cfs;
for (const auto& [key, _] : physicalShards) {
cfs.push_back(rocksdb::ColumnFamilyDescriptor{ key, getCFOptions() });
columnFamilyMap.clear();
for (auto& [_, shard] : physicalShards) {
shard->deletePending = true;
}
auto s = rocksdb::DestroyDB(path, getOptions(), cfs);
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
s = rocksdb::DestroyDB(path, getOptions());
if (!s.ok()) {
logRocksDBError(s, "DestroyDB");
}
@ -1121,7 +1148,6 @@ private:
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::unique_ptr<std::set<PhysicalShard*>> dirtyShards;
KeyRangeMap<DataShard*> dataShardMap;
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
std::deque<std::string> pendingDeletionShards;
};
@ -2240,6 +2266,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
// TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are
// occurring.
if (g_network->isSimulated()) {
TraceEvent(SevDebug, "ShardedRocksDB").detail("Info", "Use Coro threads in simulation.");
writeThread = CoroThreadPool::createThreadPool();
readThreads = CoroThreadPool::createThreadPool();
} else {
@ -2316,7 +2343,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override { shardManager.put(kv.key, kv.value); }
void clear(KeyRangeRef range, const Arena*) override {
void clear(KeyRangeRef range, const StorageServerMetrics*, const Arena*) override {
if (range.singleKeyRange()) {
shardManager.clear(range.begin);
} else {

@ -405,10 +405,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
.detail("Offset", asset.offset)
.detail("Length", asset.len);
// Ensure data blocks in the same file are processed in order
wait(processedFileOffset->whenAtLeast(asset.offset));
ASSERT(processedFileOffset->get() == asset.offset);
state Arena tempArena;
state StringRefReader reader(buf, restore_corrupted_data());
try {
@ -430,8 +426,9 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
const uint8_t* message = reader.consume(msgSize);
// Skip mutations out of the version range
if (!asset.isInVersionRange(msgVersion.version))
if (!asset.isInVersionRange(msgVersion.version)) {
continue;
}
state VersionedMutationsMap::iterator it;
bool inserted;
@ -452,6 +449,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
// Skip mutations whose commitVersion < the range kv's version
if (logMutationTooOld(pRangeVersions, mutation, msgVersion.version)) {
cc->oldLogMutations += 1;
wait(yield()); // avoid potential stack overflows
continue;
}
@ -459,6 +457,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
if (mutation.param1 >= asset.range.end ||
(isRangeMutation(mutation) && mutation.param2 < asset.range.begin) ||
(!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) {
wait(yield()); // avoid potential stack overflows
continue;
}
@ -509,7 +508,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
.detail("BlockLen", asset.len);
throw;
}
processedFileOffset->set(asset.offset + asset.len);
return Void();
}
@ -526,8 +524,19 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
state int readFileRetries = 0;
loop {
try {
// Ensure data blocks in the same file are processed in order
wait(processedFileOffset->whenAtLeast(asset.offset));
ASSERT(processedFileOffset->get() == asset.offset);
wait(_parsePartitionedLogFileOnLoader(
pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset, cx));
processedFileOffset->set(asset.offset + asset.len);
TraceEvent("FastRestoreLoaderDecodingLogFileDone")
.detail("BatchIndex", asset.batchIndex)
.detail("Filename", asset.filename)
.detail("Offset", asset.offset)
.detail("Length", asset.len);
break;
} catch (Error& e) {
if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||

@ -529,6 +529,7 @@ ACTOR Future<Void> fetchCheckpointFile(Database cx,
state int64_t offset = 0;
state Reference<IAsyncFile> asyncFile;
loop {
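// Restart the transfer from the beginning of the file on each retry attempt.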
offset = 0;
try {
asyncFile = Reference<IAsyncFile>();
++attempt;
@ -559,7 +560,8 @@ ACTOR Future<Void> fetchCheckpointFile(Database cx,
offset += rep.data.size();
}
} catch (Error& e) {
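// In simulation, occasionally treat a first-attempt end_of_stream as a failure so the retry path is exercised.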
if (e.code() != error_code_end_of_stream) {
if (e.code() != error_code_end_of_stream ||
(g_network->isSimulated() && attempt == 1 && deterministicRandom()->coinflip())) {
TraceEvent("FetchCheckpointFileError")
.errorUnsuppressed(e)
.detail("RemoteFile", remoteFile)

@ -107,7 +107,8 @@ bool destructed = false;
class TestConfig : public BasicTestConfig {
class ConfigBuilder {
using value_type = toml::basic_value<toml::discard_comments>;
using base_variant = std::variant<int, float, double, bool, std::string, std::vector<int>, ConfigDBType>;
using base_variant = std::
variant<int, float, double, bool, std::string, std::vector<int>, std::vector<std::string>, ConfigDBType>;
using types =
variant_map<variant_concat<base_variant, variant_map<base_variant, Optional>>, std::add_pointer_t>;
std::unordered_map<std::string_view, types> confMap;
@ -148,6 +149,17 @@ class TestConfig : public BasicTestConfig {
(*this)(&res);
*val = std::move(res);
}
void operator()(std::vector<std::string>* val) const {
auto arr = value.as_array();
for (const auto& i : arr) {
val->emplace_back(i.as_string());
}
}
void operator()(Optional<std::vector<std::string>>* val) const {
std::vector<std::string> res;
(*this)(&res);
*val = std::move(res);
}
};
struct trace_visitor {
@ -178,6 +190,26 @@ class TestConfig : public BasicTestConfig {
(*this)(&(val->get()));
}
}
void operator()(std::vector<std::string> const* val) const {
if (val->empty()) {
evt.detail(key.c_str(), "[]");
return;
}
std::stringstream value;
value << "[" << val->at(0);
for (int i = 1; i < val->size(); ++i) {
value << "," << val->at(i);
}
value << "]";
evt.detail(key.c_str(), value.str());
}
void operator()(Optional<std::vector<std::string>> const* val) const {
if (!val->present()) {
evt.detail(key.c_str(), *val);
} else {
(*this)(&(val->get()));
}
}
void operator()(ConfigDBType const* val) const { evt.detail(key.c_str(), *val); }
void operator()(Optional<ConfigDBType> const* val) const {
Optional<std::string> optStr;
@ -312,12 +344,24 @@ class TestConfig : public BasicTestConfig {
if (attrib == "blobGranulesEnabled") {
blobGranulesEnabled = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "allowDefaultTenant") {
allowDefaultTenant = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "allowCreatingTenants") {
allowCreatingTenants = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSTargetedRestart") {
injectTargetedSSRestart = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSDelay") {
injectSSDelay = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "tenantModes") {
std::stringstream ss(value);
std::string token;
while (std::getline(ss, token, ',')) {
tenantModes.push_back(token);
}
}
if (attrib == "defaultTenant") {
defaultTenant = value;
}
}
@ -365,11 +409,14 @@ public:
bool randomlyRenameZoneId = false;
bool allowDefaultTenant = true;
bool allowDisablingTenants = true;
bool allowCreatingTenants = true;
bool injectTargetedSSRestart = false;
bool tenantModeRequired = false;
bool injectSSDelay = false;
// By default, the tenant mode is set randomly
// If provided, it is set using TenantMode::fromString
// Ensure there is no '_experimental' suffix in the mode name
std::vector<std::string> tenantModes;
Optional<std::string> defaultTenant;
std::string testClass; // unused -- used in TestHarness
float testPriority; // unused -- used in TestHarness
@ -432,12 +479,12 @@ public:
.add("extraMachineCountDC", &extraMachineCountDC)
.add("blobGranulesEnabled", &blobGranulesEnabled)
.add("allowDefaultTenant", &allowDefaultTenant)
.add("allowDisablingTenants", &allowDisablingTenants)
.add("allowCreatingTenants", &allowCreatingTenants)
.add("tenantModeRequired", &tenantModeRequired)
.add("randomlyRenameZoneId", &randomlyRenameZoneId)
.add("injectTargetedSSRestart", &injectTargetedSSRestart)
.add("injectSSDelay", &injectSSDelay);
.add("injectSSDelay", &injectSSDelay)
.add("tenantModes", &tenantModes)
.add("defaultTenant", &defaultTenant);
try {
auto file = toml::parse(testFile);
if (file.contains("configuration") && toml::find(file, "configuration").is_table()) {
@ -1118,18 +1165,18 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
int* pTesterCount,
Optional<ClusterConnectionString>* pConnString,
Standalone<StringRef>* pStartingConfiguration,
TestConfig testConfig,
TestConfig* testConfig,
std::string whitelistBinPaths,
ProtocolVersion protocolVersion) {
CSimpleIni ini;
ini.SetUnicode();
ini.LoadFile(joinPath(baseFolder, "restartInfo.ini").c_str());
auto configDBType = testConfig.getConfigDBType();
auto configDBType = testConfig->getConfigDBType();
// Randomly change data center id names to test that localities
// can be modified on cluster restart
bool renameZoneIds = testConfig.randomlyRenameZoneId ? deterministicRandom()->random01() < 0.1 : false;
bool renameZoneIds = testConfig->randomlyRenameZoneId ? deterministicRandom()->random01() < 0.1 : false;
CODE_PROBE(renameZoneIds, "Zone ID names altered in restart test");
// allows multiple ipAddr entries
@ -1146,26 +1193,34 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
int desiredCoordinators = atoi(ini.GetValue("META", "desiredCoordinators"));
int testerCount = atoi(ini.GetValue("META", "testerCount"));
auto tssModeStr = ini.GetValue("META", "tssMode");
auto tenantMode = ini.GetValue("META", "tenantMode");
if (tenantMode != nullptr) {
testConfig->tenantModes.push_back(tenantMode);
}
std::string defaultTenant = ini.GetValue("META", "defaultTenant", "");
if (!defaultTenant.empty()) {
testConfig->defaultTenant = defaultTenant;
}
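// For example (illustrative values), the restartInfo.ini written by the first phase of a restarting
// test might carry:
//   [META]
//   tenantMode=required
//   defaultTenant=SimulatedDefaultTenant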
if (tssModeStr != nullptr) {
g_simulator->tssMode = (ISimulator::TSSMode)atoi(tssModeStr);
}
ClusterConnectionString conn(ini.GetValue("META", "connectionString"));
if (testConfig.extraDatabaseMode == ISimulator::ExtraDatabaseMode::Local) {
if (testConfig->extraDatabaseMode == ISimulator::ExtraDatabaseMode::Local) {
g_simulator->extraDatabases.clear();
g_simulator->extraDatabases.push_back(conn.toString());
}
if (!testConfig.disableHostname) {
if (!testConfig->disableHostname) {
auto mockDNSStr = ini.GetValue("META", "mockDNS");
if (mockDNSStr != nullptr) {
INetworkConnections::net()->parseMockDNSFromString(mockDNSStr);
}
}
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
if (testConfig.disableRemoteKVS) {
if (testConfig->disableRemoteKVS) {
g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false }));
TraceEvent(SevDebug, "DisableRemoteKVS");
}
if (testConfig.disableEncryption) {
if (testConfig->disableEncryption) {
g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false }));
g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false }));
g_knobs.setKnob("enable_storage_server_encryption", KnobValueRef::create(bool{ false }));
@ -2451,9 +2506,7 @@ ACTOR void setupAndRun(std::string dataFolder,
allowList.addTrustedSubnet("0.0.0.0/2"sv);
allowList.addTrustedSubnet("abcd::/16"sv);
state bool allowDefaultTenant = testConfig.allowDefaultTenant;
state bool allowDisablingTenants = testConfig.allowDisablingTenants;
state bool allowCreatingTenants = testConfig.allowCreatingTenants;
state bool tenantModeRequired = testConfig.tenantModeRequired;
if (!SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
testConfig.storageEngineExcludeTypes.push_back(5);
@ -2465,12 +2518,6 @@ ACTOR void setupAndRun(std::string dataFolder,
if (std::string_view(testFile).find("restarting") != std::string_view::npos) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
// Disable the default tenant in restarting tests for now
// TODO: persist the chosen default tenant in the restartInfo.ini file for the second test
allowDefaultTenant = false;
allowCreatingTenants = false;
tenantModeRequired = false;
}
// TODO: Currently backup and restore related simulation tests are failing when run with rocksDB storage engine
@ -2520,31 +2567,28 @@ ACTOR void setupAndRun(std::string dataFolder,
state Optional<TenantName> defaultTenant;
state Standalone<VectorRef<TenantNameRef>> tenantsToCreate;
state TenantMode tenantMode = TenantMode::DISABLED;
if (tenantModeRequired || (allowDefaultTenant && deterministicRandom()->random01() < 0.5)) {
defaultTenant = "SimulatedDefaultTenant"_sr;
tenantsToCreate.push_back_deep(tenantsToCreate.arena(), defaultTenant.get());
if (tenantModeRequired || deterministicRandom()->random01() < 0.9) {
tenantMode = TenantMode::REQUIRED;
} else {
// If this is a restarting test, restartInfo.ini is read in restartSimulatedSystem
// where we update the defaultTenant and tenantMode in the testConfig
// Defer setting tenant mode and default tenant until later
if (!rebooting) {
if (testConfig.tenantModes.size()) {
auto randomPick = deterministicRandom()->randomChoice(testConfig.tenantModes);
tenantMode = TenantMode::fromString(randomPick);
if (tenantMode == TenantMode::REQUIRED && allowDefaultTenant) {
defaultTenant = "SimulatedDefaultTenant"_sr;
}
} else if (allowDefaultTenant && deterministicRandom()->coinflip()) {
defaultTenant = "SimulatedDefaultTenant"_sr;
if (deterministicRandom()->random01() < 0.9) {
tenantMode = TenantMode::REQUIRED;
} else {
tenantMode = TenantMode::OPTIONAL_TENANT;
}
} else if (deterministicRandom()->coinflip()) {
tenantMode = TenantMode::OPTIONAL_TENANT;
}
} else if (!allowDisablingTenants || deterministicRandom()->random01() < 0.5) {
tenantMode = TenantMode::OPTIONAL_TENANT;
}
if (allowCreatingTenants && tenantMode != TenantMode::DISABLED && deterministicRandom()->random01() < 0.5) {
int numTenants = deterministicRandom()->randomInt(1, 6);
for (int i = 0; i < numTenants; ++i) {
tenantsToCreate.push_back_deep(tenantsToCreate.arena(),
TenantNameRef(format("SimulatedExtraTenant%04d", i)));
}
}
TraceEvent("SimulatedClusterTenantMode")
.detail("UsingTenant", defaultTenant)
.detail("TenantRequired", tenantMode.toString())
.detail("TotalTenants", tenantsToCreate.size());
try {
// systemActors.push_back( startSystemMonitor(dataFolder) );
if (rebooting) {
@ -2553,7 +2597,7 @@ ACTOR void setupAndRun(std::string dataFolder,
&testerCount,
&connectionString,
&startingConfiguration,
testConfig,
&testConfig,
whitelistBinPaths,
protocolVersion),
100.0));
@ -2574,6 +2618,31 @@ ACTOR void setupAndRun(std::string dataFolder,
tenantMode);
wait(delay(1.0)); // FIXME: WHY!!! Wait for machines to boot
}
// restartSimulatedSystem can adjust some testConfig params related to tenants
// so set/overwrite those options if necessary here
if (rebooting && testConfig.tenantModes.size()) {
tenantMode = TenantMode::fromString(testConfig.tenantModes[0]);
}
if (testConfig.defaultTenant.present() && tenantMode != TenantMode::DISABLED && allowDefaultTenant) {
// Default tenant set by testConfig or restarting data in restartInfo.ini
defaultTenant = testConfig.defaultTenant.get();
}
if (!rebooting) {
if (defaultTenant.present() && allowDefaultTenant) {
tenantsToCreate.push_back_deep(tenantsToCreate.arena(), defaultTenant.get());
}
if (allowCreatingTenants && tenantMode != TenantMode::DISABLED && deterministicRandom()->coinflip()) {
int numTenants = deterministicRandom()->randomInt(1, 6);
for (int i = 0; i < numTenants; ++i) {
tenantsToCreate.push_back_deep(tenantsToCreate.arena(),
TenantNameRef(format("SimulatedExtraTenant%04d", i)));
}
}
}
TraceEvent("SimulatedClusterTenantMode")
.detail("UsingTenant", defaultTenant)
.detail("TenantMode", tenantMode.toString())
.detail("TotalTenants", tenantsToCreate.size());
std::string clusterFileDir = joinPath(dataFolder, deterministicRandom()->randomUniqueID().toString());
platform::createDirectory(clusterFileDir);
writeFile(joinPath(clusterFileDir, "fdb.cluster"), connectionString.get().toString());

@ -122,19 +122,20 @@ public:
ACTOR static Future<Void> monitorStorageUsage(TenantCache* tenantCache) {
TraceEvent(SevInfo, "StartingTenantCacheStorageUsageMonitor", tenantCache->id()).log();
state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_REFRESH_INTERVAL;
state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL;
state double lastTenantListFetchTime = now();
loop {
state double fetchStartTime = now();
state std::vector<std::pair<KeyRef, TenantName>> tenantList = tenantCache->getTenantList();
state std::vector<TenantName> tenants = tenantCache->getTenantList();
state int i;
for (i = 0; i < tenantList.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenantList[i].second);
for (i = 0; i < tenants.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]);
loop {
try {
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
tenantCache->updateStorageUsage(tenantList[i].first, size);
tenantCache->tenantStorageMap[tenants[i]].usage = size;
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
wait(tr.onError(e));
@ -149,6 +150,31 @@ public:
wait(delay(refreshInterval));
}
}
ACTOR static Future<Void> monitorStorageQuota(TenantCache* tenantCache) {
TraceEvent(SevInfo, "StartingTenantCacheStorageQuotaMonitor", tenantCache->id()).log();
state Transaction tr(tenantCache->dbcx());
loop {
loop {
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
for (auto const kv : currentQuotas) {
TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix);
int64_t const quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
tenantCache->tenantStorageMap[tenant].quota = quota;
}
tr.reset();
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
}
}
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
}
}
};
void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) {
@ -203,21 +229,14 @@ int TenantCache::cleanup() {
return tenantsRemoved;
}
std::vector<std::pair<KeyRef, TenantName>> TenantCache::getTenantList() const {
std::vector<std::pair<KeyRef, TenantName>> tenants;
std::vector<TenantName> TenantCache::getTenantList() const {
std::vector<TenantName> tenants;
for (const auto& [prefix, entry] : tenantCache) {
tenants.push_back({ prefix, entry->name() });
tenants.push_back(entry->name());
}
return tenants;
}
void TenantCache::updateStorageUsage(KeyRef prefix, int64_t size) {
auto it = tenantCache.find(prefix);
if (it != tenantCache.end()) {
it->value->updateStorageUsage(size);
}
}
std::string TenantCache::desc() const {
std::string s("@Generation: ");
s += std::to_string(generation) + " ";
@ -264,6 +283,16 @@ Optional<Reference<TCTenantInfo>> TenantCache::tenantOwning(KeyRef key) const {
return it->value;
}
std::vector<TenantName> TenantCache::getTenantsOverQuota() const {
std::vector<TenantName> tenants;
for (const auto& [tenant, storage] : tenantStorageMap) {
if (storage.usage > storage.quota) {
tenants.push_back(tenant);
}
}
return tenants;
}
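// Usage sketch (illustrative, not part of this change): a monitor could act on the over-quota list, e.g.
//   for (const auto& tenant : tenantCache->getTenantsOverQuota()) {
//       TraceEvent(SevWarn, "TenantOverStorageQuota").detail("Tenant", tenant);
//   }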
Future<Void> TenantCache::monitorTenantMap() {
return TenantCacheImpl::monitorTenantMap(this);
}
@ -272,6 +301,10 @@ Future<Void> TenantCache::monitorStorageUsage() {
return TenantCacheImpl::monitorStorageUsage(this);
}
Future<Void> TenantCache::monitorStorageQuota() {
return TenantCacheImpl::monitorStorageQuota(this);
}
class TenantCacheUnitTest {
public:
ACTOR static Future<Void> InsertAndTestPresence() {

@ -1620,9 +1620,17 @@ ACTOR Future<Void> redwoodMetricsLogger() {
}
// Holds an index of recently used objects.
// ObjectType must have the methods
// bool evictable() const; // return true if the entry can be evicted
// Future<Void> onEvictable() const; // ready when entry can be evicted
// ObjectType must have these methods
//
// // Returns true iff the entry can be evicted
// bool evictable() const;
//
// // Ready when object is safe to evict from cache
// Future<Void> onEvictable() const;
//
// // Ready when object destruction is safe
// // Should cancel pending async operations that are safe to cancel when cache is being destroyed
// Future<Void> cancel() const;
template <class IndexType, class ObjectType>
class ObjectCache : NonCopyable {
struct Entry;
@ -1845,7 +1853,7 @@ public:
}
// Clears the cache: claims ownership of all entries from the evictor, then waits for each item to become
// evictable (or cancels it) and evicts it.
ACTOR static Future<Void> clear_impl(ObjectCache* self) {
ACTOR static Future<Void> clear_impl(ObjectCache* self, bool waitForSafeEviction) {
// Claim ownership of all of our cached items, removing them from the evictor's control and quota.
for (auto& ie : self->cache) {
self->pEvictor->reclaim(ie.second);
@ -1857,16 +1865,15 @@ public:
state typename CacheT::iterator i = self->cache.begin();
while (i != self->cache.end()) {
if (!i->second.item.evictable()) {
wait(i->second.item.onEvictable());
}
wait(waitForSafeEviction ? i->second.item.onEvictable() : i->second.item.cancel());
++i;
}
self->cache.clear();
return Void();
}
Future<Void> clear() { return clear_impl(this); }
Future<Void> clear(bool waitForSafeEviction = false) { return clear_impl(this, waitForSafeEviction); }
// Move the prioritized evictions queued to the front of the eviction order
void flushPrioritizedEvictions() { pEvictor->moveIn(prioritizedEvictions); }
@ -1927,6 +1934,13 @@ public:
// Entry is evictable when its write and read futures are ready, even if they are
// errors, so any buffers they hold are no longer needed by the underlying file actors
Future<Void> onEvictable() const { return ready(readFuture) && ready(writeFuture); }
// Read and write futures are safe to cancel so just cancel them and return
Future<Void> cancel() {
writeFuture.cancel();
readFuture.cancel();
return Void();
}
};
typedef ObjectCache<LogicalPageID, PageCacheEntry> PageCacheT;
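PageCacheEntry above is the in-tree implementation of the three-method contract; as a minimal illustration, any object type along these lines (assuming Flow's Future and ready() helpers) can be cached:

// A trivial ObjectType satisfying the ObjectCache contract (illustrative only).
struct SketchCacheEntry {
	Future<Void> pendingIO; // outstanding async work tied to this entry

	// Evictable once the pending work has finished, successfully or not.
	bool evictable() const { return pendingIO.isReady(); }

	// Ready when the entry can be evicted; ready() discards errors.
	Future<Void> onEvictable() const { return ready(pendingIO); }

	// Nothing outside this entry depends on the pending work, so it is safe to cancel.
	Future<Void> cancel() {
		pendingIO.cancel();
		return Void();
	}
};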
@ -2475,14 +2489,15 @@ public:
Future<LogicalPageID> newExtentPageID(QueueID queueID) override { return newExtentPageID_impl(this, queueID); }
ACTOR static Future<Void> writePhysicalBlock(DWALPager* self,
Reference<ArenaPage> page,
int blockNum,
int blockSize,
PhysicalPageID pageID,
PagerEventReasons reason,
unsigned int level,
bool header) {
// Write one block of a physical page in the page file. Returned futures must be allowed to complete.
ACTOR static UNCANCELLABLE Future<Void> writePhysicalBlock(DWALPager* self,
Reference<ArenaPage> page,
int blockNum,
int blockSize,
PhysicalPageID pageID,
PagerEventReasons reason,
unsigned int level,
bool header) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ? ioMaxPriority : ioMinPriority));
++g_redwoodMetrics.metric.pagerDiskWrite;
@ -2506,7 +2521,11 @@ public:
// Note: Not using forwardError here so a write error won't be discovered until commit time.
debug_printf("DWALPager(%s) op=writeBlock %s\n", self->filename.c_str(), toString(pageID).c_str());
wait(self->pageFile->write(page->rawData() + (blockNum * blockSize), blockSize, (int64_t)pageID * blockSize));
debug_printf("DWALPager(%s) op=writeBlockDone %s\n", self->filename.c_str(), toString(pageID).c_str());
// The next line could crash on shutdown: this actor cannot be cancelled, so self could be destroyed after the
// write completes. Enable it with caution when debugging.
// debug_printf("DWALPager(%s) op=writeBlockDone %s\n", self->filename.c_str(), toString(pageID).c_str());
return Void();
}
@ -2530,6 +2549,7 @@ public:
return Void();
}
// All returned futures are added to the operations vector
Future<Void> writePhysicalPage(PagerEventReasons reason,
unsigned int level,
Standalone<VectorRef<PhysicalPageID>> pageIDs,
@ -2753,18 +2773,19 @@ public:
}
void freeExtent(LogicalPageID pageID) override { freeExtent_impl(this, pageID); }
ACTOR static Future<int> readPhysicalBlock(DWALPager* self,
uint8_t* data,
int blockSize,
int64_t offset,
int priority) {
ACTOR static UNCANCELLABLE Future<int> readPhysicalBlock(DWALPager* self,
Reference<ArenaPage> pageBuffer,
int pageOffset,
int blockSize,
int64_t offset,
int priority) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority)));
++g_redwoodMetrics.metric.pagerDiskRead;
int bytes = wait(self->pageFile->read(data, blockSize, offset));
int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset));
return bytes;
}
// Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock
// Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock.
// If the user-chosen physical page size is larger, then there will be a gap of unused space after the header pages
// and before the user-chosen sized pages.
ACTOR static Future<Reference<ArenaPage>> readPhysicalPage(DWALPager* self,
@ -2781,8 +2802,8 @@ public:
page->rawData(),
header);
int readBytes = wait(
readPhysicalBlock(self, page->rawData(), page->rawSize(), (int64_t)pageID * page->rawSize(), priority));
int readBytes =
wait(readPhysicalBlock(self, page, 0, page->rawSize(), (int64_t)pageID * page->rawSize(), priority));
debug_printf("DWALPager(%s) op=readPhysicalDiskReadComplete %s ptr=%p bytes=%d\n",
self->filename.c_str(),
toString(pageID).c_str(),
@ -2845,8 +2866,8 @@ public:
state int blockSize = self->physicalPageSize;
std::vector<Future<int>> reads;
for (int i = 0; i < pageIDs.size(); ++i) {
reads.push_back(readPhysicalBlock(
self, page->rawData() + (i * blockSize), blockSize, ((int64_t)pageIDs[i]) * blockSize, priority));
reads.push_back(
readPhysicalBlock(self, page, i * blockSize, blockSize, ((int64_t)pageIDs[i]) * blockSize, priority));
}
// wait for all the parallel read futures
wait(waitForAll(reads));
@ -3083,8 +3104,8 @@ public:
currentOffset = i * physicalReadSize;
debug_printf("DWALPager(%s) current offset %" PRId64 "\n", self->filename.c_str(), currentOffset);
++g_redwoodMetrics.metric.pagerDiskRead;
reads.push_back(
self->pageFile->read(extent->rawData() + currentOffset, physicalReadSize, startOffset + currentOffset));
reads.push_back(self->readPhysicalBlock(
self, extent, currentOffset, physicalReadSize, startOffset + currentOffset, ioMaxPriority));
}
// Handle the last read separately as it may be smaller than physicalReadSize
@ -3096,8 +3117,8 @@ public:
currentOffset,
lastReadSize);
++g_redwoodMetrics.metric.pagerDiskRead;
reads.push_back(
self->pageFile->read(extent->rawData() + currentOffset, lastReadSize, startOffset + currentOffset));
reads.push_back(self->readPhysicalBlock(
self, extent, currentOffset, lastReadSize, startOffset + currentOffset, ioMaxPriority));
}
// wait for all the parallel read futures for the given extent
@ -3562,30 +3583,36 @@ public:
Value getCommitRecord() const override { return lastCommittedHeader.userCommitRecord; }
ACTOR void shutdown(DWALPager* self, bool dispose) {
// Send to the error promise first and then delay(0) to give users a chance to cancel
// any outstanding operations
if (self->errorPromise.canBeSet()) {
debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str());
self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress
}
wait(delay(0));
// The next section explicitly cancels all pending operations held in the pager
debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str());
self->ioLock.kill();
debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str());
self->recoverFuture.cancel();
debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str());
self->commitFuture.cancel();
debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str());
self->remapCleanupFuture.cancel();
debug_printf("DWALPager(%s) shutdown kill file extension\n", self->filename.c_str());
self->fileExtension.cancel();
if (self->errorPromise.canBeSet()) {
debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str());
self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress
debug_printf("DWALPager(%s) shutdown cancel operations\n", self->filename.c_str());
for (auto& f : self->operations) {
f.cancel();
}
// Must wait for pending operations to complete; canceling them can cause a crash because the underlying
// operations may be uncancellable and depend on memory from the calling scope's page reference
debug_printf("DWALPager(%s) shutdown wait for operations\n", self->filename.c_str());
// Pending ops must all be ready; errors are okay
wait(waitForAllReady(self->operations));
self->operations.clear();
debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str());
wait(self->extentCache.clear());
wait(self->pageCache.clear());
wait(delay(0));
debug_printf("DWALPager(%s) shutdown remappedPagesMap: %s\n",
self->filename.c_str(),
@ -3810,7 +3837,11 @@ private:
Promise<Void> closedPromise;
Promise<Void> errorPromise;
Future<Void> commitFuture;
// The operations vector is used to hold all disk writes made by the Pager, but could also hold
// other operations that need to be waited on before a commit can finish.
std::vector<Future<Void>> operations;
Future<Void> recoverFuture;
Future<Void> remapCleanupFuture;
bool remapCleanupStop;
@ -4582,7 +4613,7 @@ struct BoundaryRefAndPage {
// DecodeBoundaryVerifier provides simulation-only verification of DeltaTree boundaries between
// reads and writes by using a static structure to track boundaries used during DeltaTree generation
// for all writes and updates across cold starts and virtual process restarts.
struct DecodeBoundaryVerifier {
class DecodeBoundaryVerifier {
struct DecodeBoundaries {
Key lower;
Key upper;
@ -4593,11 +4624,13 @@ struct DecodeBoundaryVerifier {
typedef std::map<Version, DecodeBoundaries> BoundariesByVersion;
std::unordered_map<LogicalPageID, BoundariesByVersion> boundariesByPageID;
std::vector<Key> boundarySamples;
int boundarySampleSize = 1000;
int boundaryPopulation = 0;
Reference<IPageEncryptionKeyProvider> keyProvider;
public:
std::vector<Key> boundarySamples;
// Sample rate of pages to be scanned to verify that all entries in the page meet the domain prefix requirement.
double domainPrefixScanProbability = 0.01;
uint64_t domainPrefixScanCount = 0;
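The `boundarySampleSize` / `boundaryPopulation` pair alongside `boundarySamples` suggests a fixed-size, reservoir-style sampler backing `getSample()`. A minimal sketch of that technique under that assumption (an illustration, not the actual Redwood code):

```cpp
#include <cstdint>
#include <random>
#include <string>
#include <vector>

struct BoundarySampler {
    std::vector<std::string> samples; // like boundarySamples
    size_t sampleSize = 1000;         // like boundarySampleSize
    uint64_t population = 0;          // like boundaryPopulation
    std::mt19937_64 rng{ std::random_device{}() };

    void add(const std::string& boundary) {
        ++population;
        if (samples.size() < sampleSize) {
            samples.push_back(boundary);
        } else {
            // Keep every boundary seen so far equally likely to remain sampled:
            // replace a random slot with probability sampleSize / population.
            uint64_t j = std::uniform_int_distribution<uint64_t>(0, population - 1)(rng);
            if (j < sampleSize)
                samples[j] = boundary;
        }
    }

    // Mirrors getSample() above: empty key if nothing sampled, else a random choice.
    std::string getSample() {
        if (samples.empty())
            return {};
        return samples[std::uniform_int_distribution<size_t>(0, samples.size() - 1)(rng)];
    }
};

int main() {
    BoundarySampler s;
    for (int i = 0; i < 5000; ++i)
        s.add("boundary-" + std::to_string(i));
    return s.getSample().empty() ? 1 : 0;
}
```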
@ -4626,7 +4659,7 @@ struct DecodeBoundaryVerifier {
if (boundarySamples.empty()) {
return Key();
}
return boundarySamples[deterministicRandom()->randomInt(0, boundarySamples.size())];
return deterministicRandom()->randomChoice(boundarySamples);
}
bool update(BTreeNodeLinkRef id,
@ -5192,6 +5225,15 @@ public:
Future<Void> init() { return m_init; }
virtual ~VersionedBTree() {
// DecodeBoundaryVerifier objects outlive simulated processes.
// Thus, if we did not clear the key providers here, each DecodeBoundaryVerifier object might
// maintain references to untracked peers through its key provider. This would result in
// errors when FlowTransport::removePeerReference is called to remove a peer that is no
// longer tracked by FlowTransport::transport().
if (m_pBoundaryVerifier != nullptr) {
m_pBoundaryVerifier->setKeyProvider(Reference<IPageEncryptionKeyProvider>());
}
// This probably shouldn't be called directly (meaning deleting an instance directly), but it should be
// safe: it will cancel init and commit and leave the pager alive, though with a potentially incomplete
// set of uncommitted writes, so the pager should not be committed.
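The destructor comments above describe releasing a reference held by a long-lived, simulation-static object so that process-scoped state can actually be freed. A standalone sketch of that teardown pattern using `std::shared_ptr` (all names hypothetical):

```cpp
#include <iostream>
#include <memory>

struct KeyProvider {
    ~KeyProvider() { std::cout << "key provider released\n"; }
};

// Stands in for DecodeBoundaryVerifier: static, so it outlives simulated processes.
struct BoundaryVerifier {
    std::shared_ptr<KeyProvider> keyProvider;
    void setKeyProvider(std::shared_ptr<KeyProvider> kp) { keyProvider = std::move(kp); }
};

struct BTree {
    BoundaryVerifier* verifier = nullptr;
    explicit BTree(BoundaryVerifier* v) : verifier(v) {}
    ~BTree() {
        // Analogous to m_pBoundaryVerifier->setKeyProvider(Reference<...>()):
        // hand the long-lived verifier an empty reference so the provider
        // (and anything it pins) can be destroyed along with this instance.
        if (verifier)
            verifier->setKeyProvider(nullptr);
    }
};

int main() {
    static BoundaryVerifier verifier;
    {
        BTree tree(&verifier);
        verifier.setKeyProvider(std::make_shared<KeyProvider>());
    } // tree destroyed here; "key provider released" prints despite the static verifier
}
```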
@ -8003,7 +8045,9 @@ public:
Future<Void> getError() const override { return delayed(m_error.getFuture()); };
void clear(KeyRangeRef range, const Arena* arena = 0) override {
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = 0) override {
debug_printf("CLEAR %s\n", printable(range).c_str());
m_tree->clear(range);
}

View File

@ -140,9 +140,27 @@ private:
Future<Void> collection;
};
// Defines granule info of interest to a full restore
struct BlobGranuleRestoreVersion {
// The default and arena-copy constructors below are required by VectorRef
BlobGranuleRestoreVersion() {}
BlobGranuleRestoreVersion(Arena& a, const BlobGranuleRestoreVersion& copyFrom)
: granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version),
sizeInBytes(copyFrom.sizeInBytes) {}
UID granuleID;
KeyRangeRef keyRange;
Version version;
int64_t sizeInBytes;
};
// Defines a vector of BlobGranuleRestoreVersion
typedef Standalone<VectorRef<BlobGranuleRestoreVersion>> BlobGranuleRestoreVersionVector;
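The `(Arena&, const T&)` constructor exists so a deep copy can relocate the `KeyRangeRef` member into the destination arena. A toy sketch of why `VectorRef` element types need both constructors, using a simplified arena rather than FoundationDB's real `Arena`/`VectorRef` API:

```cpp
#include <deque>
#include <string>
#include <string_view>

struct Arena { // toy arena: owns copied byte buffers
    std::deque<std::string> storage;
    std::string_view copy(std::string_view s) {
        storage.emplace_back(s);
        return storage.back();
    }
};

struct RestoreVersion {
    RestoreVersion() = default; // required: vector slots are default-constructed
    RestoreVersion(Arena& a, const RestoreVersion& from)
      : keyRange(a.copy(from.keyRange)), version(from.version) {} // deep copy into 'a'
    std::string_view keyRange; // ref type: points into some arena
    long version = 0;
};

int main() {
    Arena a;
    RestoreVersion original;
    {
        std::string shortLived = "begin..end";
        original.keyRange = shortLived;
        RestoreVersion stable(a, original); // keyRange now owned by arena 'a'
        original = stable;
    } // shortLived is gone, but original.keyRange still points into the arena
    return original.keyRange == "begin..end" ? 0 : 1;
}
```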
ACTOR Future<Void> dumpManifest(Database db, Reference<BlobConnectionProvider> blobConn, int64_t epoch, int64_t seqNo);
ACTOR Future<Void> loadManifest(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn);
inline bool isFullRestoreMode() {
return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE;
};

View File

@ -30,23 +30,25 @@
struct BlobMigratorInterface {
constexpr static FileIdentifier file_identifier = 869199;
RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator;
RequestStream<ReplyPromise<Void>> waitFailure;
LocalityData locality;
UID uniqueID;
StorageServerInterface ssi;
BlobMigratorInterface() {}
BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {}
BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {
ssi.locality = l;
ssi.uniqueID = id;
}
void initEndpoints() {}
void initEndpoints() { ssi.initEndpoints(); }
UID id() const { return uniqueID; }
NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); }
NetworkAddress address() const { return haltBlobMigrator.getEndpoint().getPrimaryAddress(); }
bool operator==(const BlobMigratorInterface& r) const { return id() == r.id(); }
bool operator!=(const BlobMigratorInterface& r) const { return !(*this == r); }
template <class Archive>
void serialize(Archive& ar) {
// StorageServerInterface::serialize(ar);
serializer(ar, waitFailure, haltBlobMigrator, locality, uniqueID);
serializer(ar, locality, uniqueID, haltBlobMigrator);
}
};
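The change above embeds a `StorageServerInterface` in the migrator and mirrors the migrator's identity into it, letting the migrator present itself as a storage server. A simplified standalone sketch of that embed-and-forward pattern (stub types, not the real interfaces):

```cpp
#include <iostream>
#include <string>

struct StorageServerInterfaceStub {
    std::string locality;
    unsigned uniqueID = 0;
    bool endpointsReady = false;
    void initEndpoints() { endpointsReady = true; }
};

struct BlobMigratorInterfaceStub {
    std::string locality;
    unsigned uniqueID = 0;
    StorageServerInterfaceStub ssi; // embedded so the migrator can act as an SS

    BlobMigratorInterfaceStub(std::string l, unsigned id)
      : locality(std::move(l)), uniqueID(id) {
        // Mirror identity into the embedded interface, as the diff above does.
        ssi.locality = locality;
        ssi.uniqueID = uniqueID;
    }
    void initEndpoints() { ssi.initEndpoints(); } // forward endpoint setup
};

int main() {
    BlobMigratorInterfaceStub m("dc0", 42);
    m.initEndpoints();
    std::cout << "ssi id=" << m.ssi.uniqueID << " ready=" << m.ssi.endpointsReady << "\n";
}
```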

View File

@ -322,6 +322,9 @@ public:
// Log physicalShard
void logPhysicalShardCollection();
// Checks if a physical shard exists.
bool physicalShardExists(uint64_t physicalShardID);
private:
// Track physicalShard metrics by tracking keyRange metrics
void updatePhysicalShardMetricsByKeyRange(KeyRange keyRange,
@ -481,10 +484,6 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize);
// Determines the maximum shard size based on the size of the database
int64_t getMaxShardSize(double dbSizeEstimate);
struct StorageQuotaInfo {
std::map<Key, uint64_t> quotaMap;
};
#ifndef __INTEL_COMPILER
#pragma endregion
#endif

View File

@ -29,6 +29,7 @@
#include "fdbserver/IClosable.h"
#include "fdbserver/IPageEncryptionKeyProvider.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/StorageMetrics.h"
struct CheckpointRequest {
const Version version; // The FDB version at which the checkpoint is created.
@ -52,7 +53,9 @@ public:
// persistRangeMapping().
virtual bool shardAware() const { return false; }
virtual void set(KeyValueRef keyValue, const Arena* arena = nullptr) = 0;
virtual void clear(KeyRangeRef range, const Arena* arena = nullptr) = 0;
virtual void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) = 0;
virtual Future<Void> canCommit() { return Void(); }
virtual Future<Void> commit(
bool sequential = false) = 0; // returns when prior sets and clears are (atomically) durable
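The widened `clear()` adds an optional `StorageServerMetrics*` defaulting to `nullptr`, so implementations that ignore it (like the Redwood override above) keep working unchanged. A minimal standalone sketch of such an optional-metrics overload; the stub types and the particular use of the pointer are assumptions for illustration, not FDB's real ones:

```cpp
#include <iostream>
#include <map>
#include <string>

struct StorageMetricsStub {
    long long bytesCleared = 0;
};

struct IStoreStub {
    virtual void clear(const std::string& begin,
                       const std::string& end,
                       StorageMetricsStub* metrics = nullptr) = 0;
    virtual ~IStoreStub() = default;
};

struct MemStore : IStoreStub {
    std::map<std::string, std::string> data;
    void clear(const std::string& begin,
               const std::string& end,
               StorageMetricsStub* metrics = nullptr) override {
        auto lo = data.lower_bound(begin), hi = data.lower_bound(end);
        for (auto it = lo; it != hi; ++it)
            if (metrics) // optional: account for what is being dropped
                metrics->bytesCleared += it->first.size() + it->second.size();
        data.erase(lo, hi);
    }
};

int main() {
    MemStore s;
    s.data = { { "a", "1" }, { "b", "2" }, { "c", "3" } };
    StorageMetricsStub m;
    s.clear("a", "c", &m); // clears [a, c)
    std::cout << "remaining=" << s.data.size() << " bytesCleared=" << m.bytesCleared << "\n";
}
```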

View File

@ -390,7 +390,9 @@ struct RemoteIKeyValueStore : public IKeyValueStore {
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override {
interf.set.send(IKVSSetRequest{ keyValue, ReplyPromise<Void>() });
}
void clear(KeyRangeRef range, const Arena* arena = nullptr) override {
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override {
interf.clear.send(IKVSClearRequest{ range, ReplyPromise<Void>() });
}

View File

@ -268,5 +268,4 @@ public:
void removeTeam(TCTeamInfo team);
void updateCacheGeneration(int64_t generation) { m_cacheGeneration = generation; }
int64_t cacheGeneration() const { return m_cacheGeneration; }
void updateStorageUsage(int64_t size) { m_tenantInfo.storageUsage = size; }
};

View File

@ -32,6 +32,12 @@
typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix;
struct Storage {
int64_t quota = std::numeric_limits<int64_t>::max();
int64_t usage = 0;
};
typedef std::unordered_map<TenantName, Storage> TenantStorageMap;
struct TenantCacheTenantCreated {
KeyRange keys;
Promise<bool> reply;
@ -50,6 +56,9 @@ private:
uint64_t generation;
TenantMapByPrefix tenantCache;
// Map from tenant names to storage quota and usage
TenantStorageMap tenantStorageMap;
// mark the start of a new sweep of the tenant cache
void startRefresh();
@ -62,11 +71,8 @@ private:
// return count of tenants that were found to be stale and removed from the cache
int cleanup();
// return the mapping from prefix -> tenant name for all tenants stored in the cache
std::vector<std::pair<KeyRef, TenantName>> getTenantList() const;
// update the size for a tenant; do nothing if the tenant doesn't exist in the map
void updateStorageUsage(KeyRef prefix, int64_t size);
// return the TenantNames of all tenants stored in the cache
std::vector<TenantName> getTenantList() const;
UID id() const { return distributorID; }
@ -85,9 +91,14 @@ public:
Future<Void> monitorStorageUsage();
Future<Void> monitorStorageQuota();
std::string desc() const;
bool isTenantKey(KeyRef key) const;
Optional<Reference<TCTenantInfo>> tenantOwning(KeyRef key) const;
// Get the list of tenants whose current storage usage exceeds their allocated quota
std::vector<TenantName> getTenantsOverQuota() const;
};
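Given the `Storage { quota, usage }` map declared above, `getTenantsOverQuota()` can plausibly be a single scan comparing usage against quota. A minimal sketch under that assumption (not the actual TenantCache implementation):

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <string>
#include <unordered_map>
#include <vector>

struct Storage {
    int64_t quota = std::numeric_limits<int64_t>::max(); // default: effectively unlimited
    int64_t usage = 0;
};
using TenantStorageMap = std::unordered_map<std::string, Storage>; // mirrors the typedef above

std::vector<std::string> getTenantsOverQuota(const TenantStorageMap& m) {
    std::vector<std::string> over;
    for (const auto& [tenant, s] : m)
        if (s.usage > s.quota) // strictly over the allocated quota
            over.push_back(tenant);
    return over;
}

int main() {
    TenantStorageMap m;
    m["t1"] = { 100, 150 };  // over quota
    m["t2"] = { 100, 50 };   // within quota
    m["t3"].usage = 1 << 20; // no quota set -> unlimited by default
    for (const auto& t : getTenantsOverQuota(m))
        std::cout << t << " is over quota\n";
}
```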

Some files were not shown because too many files have changed in this diff.