Merge branch 'main' of github.com:apple/foundationdb into tenant-list-filter

2022-10-26 15:01:46 -07:00 · 2022-10-26 15:01:46 -07:00 · 886c286297
parent 098793893e 2bf9c2f448
commit 886c286297
103 changed files with 2941 additions and 1381 deletions
--- a/bindings/c/test/mako/mako.cpp
+++ b/bindings/c/test/mako/mako.cpp
@ -59,6 +59,8 @@
 #include "shm.hpp"
 #include "stats.hpp"
 #include "time.hpp"
+#include "rapidjson/document.h"
+#include "rapidjson/error/en.h"

 namespace mako {

@ -88,14 +90,29 @@ Transaction createNewTransaction(Database db, Arguments const& args, int id = -1
 	}
 	// Create Tenant Transaction
 	int tenant_id = (id == -1) ? urand(0, args.active_tenants - 1) : id;
+	Transaction tr;
+	std::string tenantStr;
 	// If provided tenants array, use it
 	if (tenants) {
-		return tenants[tenant_id].createTransaction();
+		tr = tenants[tenant_id].createTransaction();
+	} else {
+		tenantStr = "tenant" + std::to_string(tenant_id);
+		BytesRef tenant_name = toBytesRef(tenantStr);
+		Tenant t = db.openTenant(tenant_name);
+		tr = t.createTransaction();
 	}
-	std::string tenantStr = "tenant" + std::to_string(tenant_id);
-	BytesRef tenant_name = toBytesRef(tenantStr);
-	Tenant t = db.openTenant(tenant_name);
-	return t.createTransaction();
+	if (!args.authorization_tokens.empty()) {
+		// lookup token based on tenant name and, if found, set authz token to transaction
+		if (tenantStr.empty())
+			tenantStr = "tenant" + std::to_string(tenant_id);
+		auto tokenMapItr = args.authorization_tokens.find(tenantStr);
+		if (tokenMapItr != args.authorization_tokens.end()) {
+			tr.setOption(FDB_TR_OPTION_AUTHORIZATION_TOKEN, tokenMapItr->second);
+		} else {
+			logr.warn("Authorization token map is not empty, but could not find token for tenant '{}'", tenantStr);
+		}
+	}
+	return tr;
 }

 uint64_t byteswapHelper(uint64_t input) {
@ -815,6 +832,18 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
 		logr.error("network::setOption(FDB_NET_OPTION_DISTRIBUTED_CLIENT_TRACER): {}", err.what());
 	}

+	if (args.tls_certificate_file.has_value()) {
+		network::setOption(FDB_NET_OPTION_TLS_CERT_PATH, args.tls_certificate_file.value());
+	}
+
+	if (args.tls_key_file.has_value()) {
+		network::setOption(FDB_NET_OPTION_TLS_KEY_PATH, args.tls_key_file.value());
+	}
+
+	if (args.tls_ca_file.has_value()) {
+		network::setOption(FDB_NET_OPTION_TLS_CA_PATH, args.tls_ca_file.value());
+	}
+
 	/* enable flatbuffers if specified */
 	if (args.flatbuffers) {
 #ifdef FDB_NET_OPTION_USE_FLATBUFFERS
@ -982,57 +1011,55 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
 }

 /* initialize the parameters with default values */
-int initArguments(Arguments& args) {
-	memset(&args, 0, sizeof(Arguments)); /* zero-out everything */
-	args.num_fdb_clusters = 0;
-	args.num_databases = 1;
-	args.api_version = maxApiVersion();
-	args.json = 0;
-	args.num_processes = 1;
-	args.num_threads = 1;
-	args.async_xacts = 0;
-	args.mode = MODE_INVALID;
-	args.rows = 100000;
-	args.load_factor = 1.0;
-	args.row_digits = digits(args.rows);
-	args.seconds = 30;
-	args.iteration = 0;
-	args.tpsmax = 0;
-	args.tpsmin = -1;
-	args.tpsinterval = 10;
-	args.tpschange = TPS_SIN;
-	args.sampling = 1000;
-	args.key_length = 32;
-	args.value_length = 16;
-	args.active_tenants = 0;
-	args.total_tenants = 0;
-	args.tenant_batch_size = 10000;
-	args.zipf = 0;
-	args.commit_get = 0;
-	args.verbose = 1;
-	args.flatbuffers = 0; /* internal */
-	args.knobs[0] = '\0';
-	args.log_group[0] = '\0';
-	args.prefixpadding = 0;
-	args.trace = 0;
-	args.tracepath[0] = '\0';
-	args.traceformat = 0; /* default to client's default (XML) */
-	args.streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
-	args.txntrace = 0;
-	args.txntagging = 0;
-	memset(args.txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
+Arguments::Arguments() {
+	num_fdb_clusters = 0;
+	num_databases = 1;
+	api_version = maxApiVersion();
+	json = 0;
+	num_processes = 1;
+	num_threads = 1;
+	async_xacts = 0;
+	mode = MODE_INVALID;
+	rows = 100000;
+	load_factor = 1.0;
+	row_digits = digits(rows);
+	seconds = 30;
+	iteration = 0;
+	tpsmax = 0;
+	tpsmin = -1;
+	tpsinterval = 10;
+	tpschange = TPS_SIN;
+	sampling = 1000;
+	key_length = 32;
+	value_length = 16;
+	active_tenants = 0;
+	total_tenants = 0;
+	tenant_batch_size = 10000;
+	zipf = 0;
+	commit_get = 0;
+	verbose = 1;
+	flatbuffers = 0; /* internal */
+	knobs[0] = '\0';
+	log_group[0] = '\0';
+	prefixpadding = 0;
+	trace = 0;
+	tracepath[0] = '\0';
+	traceformat = 0; /* default to client's default (XML) */
+	streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
+	txntrace = 0;
+	txntagging = 0;
+	memset(txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
 	for (auto i = 0; i < MAX_OP; i++) {
-		args.txnspec.ops[i][OP_COUNT] = 0;
+		txnspec.ops[i][OP_COUNT] = 0;
 	}
-	args.client_threads_per_version = 0;
-	args.disable_client_bypass = false;
-	args.disable_ryw = 0;
-	args.json_output_path[0] = '\0';
-	args.stats_export_path[0] = '\0';
-	args.bg_materialize_files = false;
-	args.bg_file_path[0] = '\0';
-	args.distributed_tracer_client = 0;
-	return 0;
+	client_threads_per_version = 0;
+	disable_client_bypass = false;
+	disable_ryw = 0;
+	json_output_path[0] = '\0';
+	stats_export_path[0] = '\0';
+	bg_materialize_files = false;
+	bg_file_path[0] = '\0';
+	distributed_tracer_client = 0;
 }

 /* parse transaction specification */
@ -1279,6 +1306,10 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
 			{ "bg_file_path", required_argument, NULL, ARG_BG_FILE_PATH },
 			{ "stats_export_path", optional_argument, NULL, ARG_EXPORT_PATH },
 			{ "distributed_tracer_client", required_argument, NULL, ARG_DISTRIBUTED_TRACER_CLIENT },
+			{ "tls_certificate_file", required_argument, NULL, ARG_TLS_CERTIFICATE_FILE },
+			{ "tls_key_file", required_argument, NULL, ARG_TLS_KEY_FILE },
+			{ "tls_ca_file", required_argument, NULL, ARG_TLS_CA_FILE },
+			{ "authorization_token_file", required_argument, NULL, ARG_AUTHORIZATION_TOKEN_FILE },
 			{ NULL, 0, NULL, 0 }
 		};
 		idx = 0;
@ -1515,6 +1546,45 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
 				args.distributed_tracer_client = -1;
 			}
 			break;
+		case ARG_TLS_CERTIFICATE_FILE:
+			args.tls_certificate_file = std::string(optarg);
+			break;
+		case ARG_TLS_KEY_FILE:
+			args.tls_key_file = std::string(optarg);
+			break;
+		case ARG_TLS_CA_FILE:
+			args.tls_ca_file = std::string(optarg);
+			break;
+		case ARG_AUTHORIZATION_TOKEN_FILE: {
+			std::string tokenFilename(optarg);
+			std::ifstream ifs(tokenFilename);
+			std::ostringstream oss;
+			oss << ifs.rdbuf();
+			rapidjson::Document d;
+			d.Parse(oss.str().c_str());
+			if (d.HasParseError()) {
+				logr.error("Failed to parse authorization token JSON file '{}': {} at offset {}",
+				           tokenFilename,
+				           GetParseError_En(d.GetParseError()),
+				           d.GetErrorOffset());
+				return -1;
+			} else if (!d.IsObject()) {
+				logr.error("Authorization token JSON file '{}' must contain a JSON object", tokenFilename);
+				return -1;
+			}
+			for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) {
+				if (!itr->value.IsString()) {
+					logr.error("Token '{}' is not a string", itr->name.GetString());
+					return -1;
+				}
+				args.authorization_tokens.insert_or_assign(
+				    std::string(itr->name.GetString(), itr->name.GetStringLength()),
+				    std::string(itr->value.GetString(), itr->value.GetStringLength()));
+			}
+			logr.info("Added {} tenant authorization tokens to map from file '{}'",
+			          args.authorization_tokens.size(),
+			          tokenFilename);
+		} break;
 		}
 	}

@ -1525,93 +1595,97 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
 	return 0;
 }

-int validateArguments(Arguments const& args) {
-	if (args.mode == MODE_INVALID) {
+int Arguments::validate() {
+	if (mode == MODE_INVALID) {
 		logr.error("--mode has to be set");
 		return -1;
 	}
-	if (args.verbose < VERBOSE_NONE || args.verbose > VERBOSE_DEBUG) {
+	if (verbose < VERBOSE_NONE || verbose > VERBOSE_DEBUG) {
 		logr.error("--verbose must be between 0 and 3");
 		return -1;
 	}
-	if (args.rows <= 0) {
+	if (rows <= 0) {
 		logr.error("--rows must be a positive integer");
 		return -1;
 	}
-	if (args.load_factor <= 0 || args.load_factor > 1) {
+	if (load_factor <= 0 || load_factor > 1) {
 		logr.error("--load_factor must be in range (0, 1]");
 		return -1;
 	}
-	if (args.key_length < 0) {
+	if (key_length < 0) {
 		logr.error("--keylen must be a positive integer");
 		return -1;
 	}
-	if (args.value_length < 0) {
+	if (value_length < 0) {
 		logr.error("--vallen must be a positive integer");
 		return -1;
 	}
-	if (args.num_fdb_clusters > NUM_CLUSTERS_MAX) {
+	if (num_fdb_clusters > NUM_CLUSTERS_MAX) {
 		logr.error("Mako is not supported to do work to more than {} clusters", NUM_CLUSTERS_MAX);
 		return -1;
 	}
-	if (args.num_databases > NUM_DATABASES_MAX) {
+	if (num_databases > NUM_DATABASES_MAX) {
 		logr.error("Mako is not supported to do work to more than {} databases", NUM_DATABASES_MAX);
 		return -1;
 	}
-	if (args.num_databases < args.num_fdb_clusters) {
-		logr.error("--num_databases ({}) must be >= number of clusters({})", args.num_databases, args.num_fdb_clusters);
+	if (num_databases < num_fdb_clusters) {
+		logr.error("--num_databases ({}) must be >= number of clusters({})", num_databases, num_fdb_clusters);
 		return -1;
 	}
-	if (args.num_threads < args.num_databases) {
-		logr.error("--threads ({}) must be >= number of databases ({})", args.num_threads, args.num_databases);
+	if (num_threads < num_databases) {
+		logr.error("--threads ({}) must be >= number of databases ({})", num_threads, num_databases);
 		return -1;
 	}
-	if (args.key_length < 4 /* "mako" */ + args.row_digits) {
+	if (key_length < 4 /* "mako" */ + row_digits) {
 		logr.error("--keylen must be larger than {} to store \"mako\" prefix "
 		           "and maximum row number",
-		           4 + args.row_digits);
+		           4 + row_digits);
 		return -1;
 	}
-	if (args.active_tenants > args.total_tenants) {
+	if (active_tenants > total_tenants) {
 		logr.error("--active_tenants must be less than or equal to --total_tenants");
 		return -1;
 	}
-	if (args.tenant_batch_size < 1) {
+	if (tenant_batch_size < 1) {
 		logr.error("--tenant_batch_size must be at least 1");
 		return -1;
 	}
-	if (args.mode == MODE_RUN) {
-		if ((args.seconds > 0) && (args.iteration > 0)) {
+	if (mode == MODE_RUN) {
+		if ((seconds > 0) && (iteration > 0)) {
 			logr.error("Cannot specify seconds and iteration together");
 			return -1;
 		}
-		if ((args.seconds == 0) && (args.iteration == 0)) {
+		if ((seconds == 0) && (iteration == 0)) {
 			logr.error("Must specify either seconds or iteration");
 			return -1;
 		}
-		if (args.txntagging < 0) {
+		if (txntagging < 0) {
 			logr.error("--txntagging must be a non-negative integer");
 			return -1;
 		}
 	}

 	// ensure that all of the files provided to mako are valid and exist
-	if (args.mode == MODE_REPORT) {
-		if (!args.num_report_files) {
+	if (mode == MODE_REPORT) {
+		if (!num_report_files) {
 			logr.error("No files to merge");
 		}
-		for (int i = 0; i < args.num_report_files; i++) {
+		for (int i = 0; i < num_report_files; i++) {
 			struct stat buffer;
-			if (stat(args.report_files[i], &buffer) != 0) {
-				logr.error("Couldn't open file {}", args.report_files[i]);
+			if (stat(report_files[i], &buffer) != 0) {
+				logr.error("Couldn't open file {}", report_files[i]);
 				return -1;
 			}
 		}
 	}
-	if (args.distributed_tracer_client < 0) {
-		logr.error("--disibuted_tracer_client must specify either (disabled, network_lossy, log_file)");
+	if (distributed_tracer_client < 0) {
+		logr.error("--distributed_tracer_client must specify either (disabled, network_lossy, log_file)");
 		return -1;
 	}
+
+	if (!authorization_tokens.empty() && !tls_ca_file.has_value()) {
+		logr.warn("Authorization tokens are being used without explicit TLS CA file configured");
+	}
 	return 0;
 }

@ -2262,11 +2336,6 @@ int main(int argc, char* argv[]) {

 	auto rc = int{};
 	auto args = Arguments{};
-	rc = initArguments(args);
-	if (rc < 0) {
-		logr.error("initArguments failed");
-		return -1;
-	}
 	rc = parseArguments(argc, argv, args);
 	if (rc < 0) {
 		/* usage printed */
@ -2282,7 +2351,7 @@ int main(int argc, char* argv[]) {
 		args.total_tenants = args.active_tenants;
 	}

-	rc = validateArguments(args);
+	rc = args.validate();
 	if (rc < 0)
 		return -1;
 	logr.setVerbosity(args.verbose);
--- a/bindings/c/test/mako/mako.hpp
+++ b/bindings/c/test/mako/mako.hpp
@ -30,6 +30,7 @@
 #include <cassert>
 #include <chrono>
 #include <list>
+#include <map>
 #include <vector>
 #include <string_view>
 #include <fdb_api.hpp>
@ -79,7 +80,11 @@ enum ArgKind {
 	ARG_JSON_REPORT,
 	ARG_BG_FILE_PATH, // if blob granule files are stored locally, mako will read and materialize them if this is set
 	ARG_EXPORT_PATH,
-	ARG_DISTRIBUTED_TRACER_CLIENT
+	ARG_DISTRIBUTED_TRACER_CLIENT,
+	ARG_TLS_CERTIFICATE_FILE,
+	ARG_TLS_KEY_FILE,
+	ARG_TLS_CA_FILE,
+	ARG_AUTHORIZATION_TOKEN_FILE,
 };

 constexpr const int OP_COUNT = 0;
@ -131,6 +136,9 @@ constexpr const int MAX_REPORT_FILES = 200;

 /* benchmark parameters */
 struct Arguments {
+	Arguments();
+	int validate();
+
 	int api_version;
 	int json;
 	int num_processes;
@ -180,6 +188,10 @@ struct Arguments {
 	char report_files[MAX_REPORT_FILES][PATH_MAX];
 	int num_report_files;
 	int distributed_tracer_client;
+	std::optional<std::string> tls_certificate_file;
+	std::optional<std::string> tls_key_file;
+	std::optional<std::string> tls_ca_file;
+	std::map<std::string, std::string> authorization_tokens; // maps tenant name to token string
 };

 } // namespace mako
--- a/bindings/c/test/mako/mako.rst
+++ b/bindings/c/test/mako/mako.rst
@ -38,7 +38,7 @@ Arguments
  | - ``build``:  Populate data
  | - ``run``:  Run the benchmark

- | ``-c | --cluster <cluster file>``
+- | ``-c | --cluster <cluster_file>``
  | FDB cluster files (Required, comma-separated)

 - | ``-d | --num_databases <num_databases>``
@ -125,9 +125,21 @@ Arguments
  | Disable snapshot read-your-writes

 - | ``--json_report`` defaults to ``mako.json``
-  | ``--json_report=PATH``
+  | ``--json_report <path>``
  | Output stats to the specified json file

+- | ``--tls_certificate_file <path>``
+  | Use TLS certificate located in ``<path>``
+
+- | ``--tls_key_file <path>``
+  | Use TLS key file located in ``<path>``
+
+- | ``--tls_ca_file <path>``
+  | Use TLS CA file located in ``<path>``
+
+- | ``--authorization_token_file <path>``
+  | Use authorization token JSON file located in ``<path>``
+  | Expected content is a JSON object where each key is a tenant name and the mapped value is a token string

 Transaction Specification
 =========================
--- a/cmake/FlowCommands.cmake
+++ b/cmake/FlowCommands.cmake
@ -76,38 +76,11 @@ function(generate_coverage_xml)
  add_dependencies(coverage_${target_name} coveragetool)
 endfunction()

-# This function asserts that `versions.h` does not exist in the source
-# directory. It does this in the prebuild phase of the target.
-# This is an ugly hack that should make sure that cmake isn't used with
-# a source directory in which FDB was previously built with `make`.
-function(assert_no_version_h target)
-
-  message(STATUS "Check versions.h on ${target}")
-  set(target_name "${target}_versions_h_check")
-
-  if (DEFINED ENV{VERBOSE})
-    add_custom_target("${target_name}"
-      COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-      -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
-      COMMAND echo
-      "${CMAKE_COMMAND}" -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
-      -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-      COMMENT "Check old build system wasn't used in source dir")
-  else()
-    add_custom_target("${target_name}"
-      COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-      -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
-      COMMENT "Check old build system wasn't used in source dir")
-  endif()
-
-  add_dependencies(${target} ${target_name})
-endfunction()
-
 add_custom_target(strip_targets)
 add_dependencies(packages strip_targets)

 function(strip_debug_symbols target)
-  if (WIN32)
+  if(WIN32)
    return()
  endif()
  get_target_property(target_type ${target} TYPE)
@ -146,7 +119,7 @@ function(strip_debug_symbols target)
      COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
      COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
      COMMENT "Copy debug symbols to ${out_name}.debug")
-    add_custom_target(strip_${target} DEPENDS  "${out_file}.debug")
+    add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
  else()
    add_custom_target(strip_${target})
    add_dependencies(strip_${target} strip_only_${target})
@ -171,7 +144,7 @@ function(copy_headers)
  foreach(f IN LISTS CP_SRCS)
    is_prefix(bd "${CMAKE_CURRENT_BINARY_DIR}" "${f}")
    is_prefix(sd "${CMAKE_CURRENT_SOURCE_DIR}" "${f}")
-    if (bd OR sd)
+    if(bd OR sd)
      continue()
    endif()
    is_header(hdr "${f}")
@ -180,7 +153,7 @@ function(copy_headers)
    endif()
    get_filename_component(fname ${f} NAME)
    get_filename_component(dname ${f} DIRECTORY)
-    if (dname)
+    if(dname)
      make_directory(${incl_dir}/${dname})
    endif()
    set(fpath "${incl_dir}/${dname}/${fname}")
@ -309,9 +282,6 @@ function(add_flow_target)

    add_custom_target(${AFT_NAME}_actors DEPENDS ${generated_files})
    add_dependencies(${AFT_NAME} ${AFT_NAME}_actors)
-    if(NOT WIN32)
-      assert_no_version_h(${AFT_NAME}_actors)
-    endif()
    generate_coverage_xml(${AFT_NAME})
    if(strip_target)
      strip_debug_symbols(${AFT_NAME})
--- a/cmake/awssdk.cmake
+++ b/cmake/awssdk.cmake
@ -8,40 +8,43 @@ endif()

 include(ExternalProject)
 ExternalProject_Add(awssdk_project
-  GIT_REPOSITORY    https://github.com/aws/aws-sdk-cpp.git
-  GIT_TAG           e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
-  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
-  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
-  GIT_CONFIG        advice.detachedHead=false
-  CMAKE_ARGS        -DBUILD_SHARED_LIBS=OFF        # SDK builds shared libs by default, we want static libs
-                    -DENABLE_TESTING=OFF
-                    -DBUILD_ONLY=core              # git repo contains SDK for every AWS product, we only want the core auth libraries
-                    -DSIMPLE_INSTALL=ON
-                    -DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-                    -DBYO_CRYPTO=ON                # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-                    -DBUILD_CURL=ON
-                    -DBUILD_ZLIB=ON
-                    
-                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
-  TEST_COMMAND      ""
+  GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
+  GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
+  SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
+  BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
+  GIT_CONFIG advice.detachedHead=false
+  # it seems advice.detachedHead breaks something which causes aws sdk to always be rebuilt.
+  # This option forces to cmake to build the aws sdk only once and never attempt to update it
+  UPDATE_DISCONNECTED ON
+  CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF        # SDK builds shared libs by default, we want static libs
+  -DENABLE_TESTING=OFF
+  -DBUILD_ONLY=core              # git repo contains SDK for every AWS product, we only want the core auth libraries
+  -DSIMPLE_INSTALL=ON
+  -DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
+  -DBYO_CRYPTO=ON                # we have our own crypto libraries that conflict if we let aws sdk build and link its own
+  -DBUILD_CURL=ON
+  -DBUILD_ZLIB=ON
+
+  -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+  -DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
+  TEST_COMMAND ""
  # the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
-  BUILD_BYPRODUCTS  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
-                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
-)
+  BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
+  "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
+  )

 add_library(awssdk_core STATIC IMPORTED)
 add_dependencies(awssdk_core awssdk_project)
--- a/contrib/TestHarness2/test_harness/summarize.py
+++ b/contrib/TestHarness2/test_harness/summarize.py
@ -159,13 +159,20 @@ class Parser:
        pass


-class XmlParser(Parser, xml.sax.handler.ContentHandler):
+class XmlParser(Parser, xml.sax.handler.ContentHandler, xml.sax.handler.ErrorHandler):
    def __init__(self):
        super().__init__()
        self.handler: ParseHandler | None = None

    def parse(self, file: TextIO, handler: ParseHandler) -> None:
-        xml.sax.parse(file, self)
+        self.handler = handler
+        xml.sax.parse(file, self, errorHandler=self)
+
+    def error(self, exception):
+        pass
+
+    def fatalError(self, exception):
+        pass

    def startElement(self, name, attrs) -> None:
        attributes: Dict[str, str] = {}
@ -276,6 +283,7 @@ class TraceFiles:
                    raise StopIteration
                self.current += 1
                return self.trace_files[self.current - 1]
+
        return TraceFilesIterator(self)


@ -426,7 +434,8 @@ class Summary:
            lines = self.error_out.splitlines()
            stderr_bytes = 0
            for line in lines:
-                if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
+                if line.endswith(
+                        "WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
                    # When running ASAN we expect to see this message. Boost coroutine should be using the correct asan annotations so that it shouldn't produce any false positives.
                    continue
                if line.endswith("Warning: unimplemented fcntl command: 1036"):
@ -560,6 +569,9 @@ class Summary:
        self.handler.add_handler(('Severity', '30'), parse_warning)

        def parse_error(attrs: Dict[str, str]):
+            if 'ErrorIsInjectedFault' in attrs and attrs['ErrorIsInjectedFault'].lower() in ['1', 'true']:
+                # ignore injected errors. In newer fdb versions these will have a lower severity
+                return
            self.errors += 1
            self.error = True
            if self.errors > config.max_errors:
@ -606,6 +618,7 @@ class Summary:
                child.attributes['File'] = attrs['File']
                child.attributes['Line'] = attrs['Line']
                self.out.append(child)
+
        self.handler.add_handler(('Type', 'BuggifySection'), buggify_section)
        self.handler.add_handler(('Type', 'FaultInjected'), buggify_section)

@ -614,9 +627,11 @@ class Summary:
            child.attributes['Name'] = attrs['Name']
            child.attributes['File'] = attrs['File']
            child.attributes['Line'] = attrs['Line']
+
        self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test)

        def stderr_severity(attrs: Dict[str, str]):
            if 'NewSeverity' in attrs:
                self.stderr_severity = attrs['NewSeverity']
+
        self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity)
--- a/fdbbackup/backup.actor.cpp
+++ b/fdbbackup/backup.actor.cpp
@ -2365,6 +2365,7 @@ ACTOR Future<Void> runRestore(Database db,
 			                                                   KeyRef(addPrefix),
 			                                                   KeyRef(removePrefix),
 			                                                   LockDB::True,
+			                                                   UnlockDB::True,
 			                                                   onlyApplyMutationLogs,
 			                                                   inconsistentSnapshotOnly,
 			                                                   beginVersion,
--- a/fdbclient/BlobCipher.cpp
+++ b/fdbclient/BlobCipher.cpp
@ -83,7 +83,7 @@ BlobCipherMetrics::BlobCipherMetrics()
                  CounterSet(cc, "Backup"),
                  CounterSet(cc, "Test") }) {
 	specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); });
-	traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc);
+	traceFuture = cc.traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL);
 }

 std::string toString(BlobCipherMetrics::UsageType type) {
--- a/fdbclient/BlobGranuleReader.actor.cpp
+++ b/fdbclient/BlobGranuleReader.actor.cpp
@ -142,7 +142,6 @@ bool isRangeFullyCovered(KeyRange range, Standalone<VectorRef<BlobGranuleChunkRe
 	for (const BlobGranuleChunkRef& chunk : blobChunks) {
 		blobRanges.push_back(chunk.keyRange);
 	}
-
 	return range.isCovered(blobRanges);
 }

@ -194,7 +193,7 @@ TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") {
 		testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks);
 		testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks);
 		testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks);
-		ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false);
+		ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks));
 	}
 	return Void();
 }
--- a/fdbclient/ClientKnobs.cpp
+++ b/fdbclient/ClientKnobs.cpp
@ -272,6 +272,7 @@ void ClientKnobs::initialize(Randomize randomize) {
 	init( TAG_THROTTLE_EXPIRATION_INTERVAL,        60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0;
 	init( WRITE_COST_BYTE_FACTOR,                 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096;
 	init( READ_COST_BYTE_FACTOR,                  16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096;
+	init( PROXY_MAX_TAG_THROTTLE_DURATION,          5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;

 	// busyness reporting
 	init( BUSYNESS_SPIKE_START_THRESHOLD,         0.100 );
--- a/fdbclient/FDBTypes.cpp
+++ b/fdbclient/FDBTypes.cpp
@ -22,6 +22,16 @@
 #include "fdbclient/Knobs.h"
 #include "fdbclient/NativeAPI.actor.h"

+KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix) {
+	if (prefix.empty()) {
+		return range;
+	} else {
+		KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin;
+		KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end;
+		return KeyRangeRef(begin, end);
+	}
+}
+
 KeyRef keyBetween(const KeyRangeRef& keys) {
 	int pos = 0; // will be the position of the first difference between keys.begin and keys.end
 	int minSize = std::min(keys.begin.size(), keys.end.size());
--- a/fdbclient/FileBackupAgent.actor.cpp
+++ b/fdbclient/FileBackupAgent.actor.cpp
@ -167,6 +167,7 @@ public:
 	KeyBackedProperty<Key> removePrefix() { return configSpace.pack(__FUNCTION__sr); }
 	KeyBackedProperty<bool> onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); }
 	KeyBackedProperty<bool> inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); }
+	KeyBackedProperty<bool> unlockDBAfterRestore() { return configSpace.pack(__FUNCTION__sr); }
 	// XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges
 	KeyBackedProperty<KeyRange> restoreRange() { return configSpace.pack(__FUNCTION__sr); }
 	KeyBackedProperty<std::vector<KeyRange>> restoreRanges() { return configSpace.pack(__FUNCTION__sr); }
@ -591,12 +592,11 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
 	}

 	ACTOR static Future<StringRef> decryptImpl(Database cx,
-	                                           StringRef headerS,
+	                                           BlobCipherEncryptHeader header,
 	                                           const uint8_t* dataP,
 	                                           int64_t dataLen,
 	                                           Arena* arena) {
 		Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
-		state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
 		TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP));
 		ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid());
 		validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header);
@ -606,7 +606,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
 	}

 	static Future<StringRef> decrypt(Database cx,
-	                                 StringRef headerS,
+	                                 BlobCipherEncryptHeader headerS,
 	                                 const uint8_t* dataP,
 	                                 int64_t dataLen,
 	                                 Arena* arena) {
@ -651,7 +651,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
 	}

 	ACTOR static Future<Void> updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, KeyRef key) {
-		state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self));
+		state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self->tenantCache));
 		state Reference<AsyncVar<ClientDBInfo> const> dbInfo = self->cx->clientInfo;

 		// Get text and header cipher key
@ -693,12 +693,13 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {

 	static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; }

-	ACTOR static Future<std::pair<int64_t, TenantName>>
-	getEncryptionDomainDetailsImpl(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache, bool useTenantCache) {
+	ACTOR static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetailsImpl(
+	    KeyRef key,
+	    Reference<TenantEntryCache<Void>> tenantCache) {
 		if (isSystemKey(key)) {
 			return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
 		}
-		if (key.size() < TENANT_PREFIX_SIZE || !useTenantCache) {
+		if (key.size() < TENANT_PREFIX_SIZE) {
 			return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
 		}
 		KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE);
@ -710,21 +711,10 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
 		return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
 	}

-	static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(KeyRef key,
-	                                                                         EncryptedRangeFileWriter* self) {
-		// If tenants are disabled on a cluster then don't use the TenantEntryCache as it will result in alot of
-		// unnecessary cache misses. For a cluster configured in TenantMode::Optional, the backup performance may
-		// degrade if most of the mutations belong to an invalid tenant
-		TenantMode mode = self->cx->clientInfo->get().tenantMode;
-		bool useTenantCache = mode != TenantMode::DISABLED;
-		if (g_network->isSimulated() && mode == TenantMode::OPTIONAL_TENANT) {
-			// TODO: Currently simulation tests run with optional tenant mode but most data does not belong to any
-			// tenant. This results in many timeouts so disable using the tenant cache until optional tenant mode
-			// support with backups is more performant
-			useTenantCache = false;
-		}
-		CODE_PROBE(useTenantCache, "using tenant cache");
-		return getEncryptionDomainDetailsImpl(key, self->tenantCache, useTenantCache);
+	static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(
+	    KeyRef key,
+	    Reference<TenantEntryCache<Void>> tenantCache) {
+		return getEncryptionDomainDetailsImpl(key, tenantCache);
 	}

 	// Handles the first block and internal blocks.  Ends current block if needed.
@ -816,6 +806,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
 		    curKeyTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
 			endKey = StringRef(k.begin(), TENANT_PREFIX_SIZE);
 		}
+
 		state ValueRef newValue = StringRef();
 		self->lastKey = k;
 		self->lastValue = v;
@ -834,9 +825,9 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
 		if (self->lastKey.size() == 0 || k.size() == 0) {
 			return false;
 		}
-		state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self));
-		state std::pair<int64_t, TenantName> prevKeyTenantInfo = wait(getEncryptionDomainDetails(self->lastKey, self));
-		// crossing tenant boundaries so finish the current block using only the tenant prefix of the new key
+		state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache));
+		state std::pair<int64_t, TenantName> prevKeyTenantInfo =
+		    wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache));
 		if (curKeyTenantInfo.first != prevKeyTenantInfo.first) {
 			CODE_PROBE(true, "crossed tenant boundaries");
 			wait(handleTenantBondary(self, k, v, writeValue, curKeyTenantInfo));
@ -1040,11 +1031,18 @@ private:
 	Key lastValue;
 };

-void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
+ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
+                                        Standalone<VectorRef<KeyValueRef>>* results,
+                                        bool encryptedBlock,
+                                        Optional<Reference<TenantEntryCache<Void>>> tenantCache,
+                                        Optional<BlobCipherEncryptHeader> encryptHeader) {
 	// Read begin key, if this fails then block was invalid.
-	uint32_t kLen = reader->consumeNetworkUInt32();
-	const uint8_t* k = reader->consume(kLen);
+	state uint32_t kLen = reader->consumeNetworkUInt32();
+	state const uint8_t* k = reader->consume(kLen);
 	results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
+	state KeyRef prevKey = KeyRef(k, kLen);
+	state bool done = false;
+	state Optional<std::pair<int64_t, TenantName>> prevTenantInfo;

 	// Read kv pairs and end key
 	while (1) {
@ -1052,6 +1050,35 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
 		kLen = reader->consumeNetworkUInt32();
 		k = reader->consume(kLen);

+		// make sure that all keys in a block belong to exactly one tenant,
+		// unless its the last key in which case it can be a truncated (different) tenant prefix
+		if (encryptedBlock && g_network && g_network->isSimulated()) {
+			ASSERT(tenantCache.present());
+			ASSERT(encryptHeader.present());
+			state KeyRef curKey = KeyRef(k, kLen);
+			if (!prevTenantInfo.present()) {
+				std::pair<int64_t, TenantName> tenantInfo =
+				    wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, tenantCache.get()));
+				prevTenantInfo = tenantInfo;
+			}
+			std::pair<int64_t, TenantName> curTenantInfo =
+			    wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, tenantCache.get()));
+			if (!curKey.empty() && !prevKey.empty() && prevTenantInfo.get().first != curTenantInfo.first) {
+				ASSERT(!done);
+				if (curTenantInfo.first != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID &&
+				    curTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
+					ASSERT(curKey.size() == TENANT_PREFIX_SIZE);
+				}
+				done = true;
+			}
+			// make sure that all keys (except possibly the last key) in a block are encrypted using the correct key
+			if (!prevKey.empty()) {
+				ASSERT(prevTenantInfo.get().first == encryptHeader.get().cipherTextDetails.encryptDomainId);
+			}
+			prevKey = curKey;
+			prevTenantInfo = curTenantInfo;
+		}
+
 		// If eof reached or first value len byte is 0xFF then a valid block end was reached.
 		if (reader->eof() || *reader->rptr == 0xFF) {
 			results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
@ -1072,6 +1099,8 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
 	for (auto b : reader->remainder())
 		if (b != 0xFF)
 			throw restore_corrupted_data_padding();
+
+	return Void();
 }

 ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1094,7 +1123,11 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
 		// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
 		int32_t file_version = reader.consume<int32_t>();
 		if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
-			decodeKVPairs(&reader, &results);
+			wait(decodeKVPairs(&reader,
+			                   &results,
+			                   false,
+			                   Optional<Reference<TenantEntryCache<Void>>>(),
+			                   Optional<BlobCipherEncryptHeader>()));
 		} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
 			CODE_PROBE(true, "decoding encrypted block");
 			ASSERT(cx.present());
@ -1108,7 +1141,8 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<

 			// read encryption header
 			const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
-			StringRef header = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
+			StringRef headerS = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
+			state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
 			const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize;
 			// calculate the total bytes read up to (and including) the header
 			int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize;
@ -1117,7 +1151,12 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
 			StringRef decryptedData =
 			    wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
 			reader = StringRefReader(decryptedData, restore_corrupted_data());
-			decodeKVPairs(&reader, &results);
+			state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
+			if (g_network && g_simulator->isSimulated()) {
+				tenantCache = makeReference<TenantEntryCache<Void>>(cx.get(), TenantEntryCacheRefreshMode::WATCH);
+				wait(tenantCache.get()->init());
+			}
+			wait(decodeKVPairs(&reader, &results, true, tenantCache, header));
 		} else {
 			throw restore_unsupported_file_version();
 		}
@ -3398,6 +3437,8 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {

 		state RestoreConfig restore(task);
 		restore.stateEnum().set(tr, ERestoreState::COMPLETED);
+		state bool unlockDB = wait(restore.unlockDBAfterRestore().getD(tr, Snapshot::False, true));
+
 		tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue);
 		// Clear the file map now since it could be huge.
 		restore.fileSet().clear(tr);
@ -3413,7 +3454,9 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
 		restore.clearApplyMutationsKeys(tr);

 		wait(taskBucket->finish(tr, task));
-		wait(unlockDatabase(tr, restore.getUid()));
+		if (unlockDB) {
+			wait(unlockDatabase(tr, restore.getUid()));
+		}

 		return Void();
 	}
@ -5172,6 +5215,7 @@ public:
 	                                        Key addPrefix,
 	                                        Key removePrefix,
 	                                        LockDB lockDB,
+	                                        UnlockDB unlockDB,
 	                                        OnlyApplyMutationLogs onlyApplyMutationLogs,
 	                                        InconsistentSnapshotOnly inconsistentSnapshotOnly,
 	                                        Version beginVersion,
@ -5245,6 +5289,7 @@ public:
 		restore.onlyApplyMutationLogs().set(tr, onlyApplyMutationLogs);
 		restore.inconsistentSnapshotOnly().set(tr, inconsistentSnapshotOnly);
 		restore.beginVersion().set(tr, beginVersion);
+		restore.unlockDBAfterRestore().set(tr, unlockDB);
 		if (BUGGIFY && restoreRanges.size() == 1) {
 			restore.restoreRange().set(tr, restoreRanges[0]);
 		} else {
@ -5836,6 +5881,7 @@ public:
 	                                     Key addPrefix,
 	                                     Key removePrefix,
 	                                     LockDB lockDB,
+	                                     UnlockDB unlockDB,
 	                                     OnlyApplyMutationLogs onlyApplyMutationLogs,
 	                                     InconsistentSnapshotOnly inconsistentSnapshotOnly,
 	                                     Version beginVersion,
@ -5892,6 +5938,7 @@ public:
 				                   addPrefix,
 				                   removePrefix,
 				                   lockDB,
+				                   unlockDB,
 				                   onlyApplyMutationLogs,
 				                   inconsistentSnapshotOnly,
 				                   beginVersion,
@ -6017,7 +6064,7 @@ public:
 			}
 		}

-		Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
+		state Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));

 		if (fastRestore) {
 			TraceEvent("AtomicParallelRestoreStartRestore").log();
@ -6043,24 +6090,80 @@ public:
 			return -1;
 		} else {
 			TraceEvent("AS_StartRestore").log();
-			Version ver = wait(restore(backupAgent,
-			                           cx,
-			                           cx,
-			                           tagName,
-			                           KeyRef(bc->getURL()),
-			                           bc->getProxy(),
-			                           ranges,
-			                           WaitForComplete::True,
-			                           ::invalidVersion,
-			                           Verbose::True,
-			                           addPrefix,
-			                           removePrefix,
-			                           LockDB::True,
-			                           OnlyApplyMutationLogs::False,
-			                           InconsistentSnapshotOnly::False,
-			                           ::invalidVersion,
-			                           {},
-			                           randomUid));
+			state Standalone<VectorRef<KeyRangeRef>> restoreRange;
+			state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
+			bool encryptionEnabled = cx->clientInfo->get().isEncryptionEnabled;
+			for (auto r : ranges) {
+				if (!encryptionEnabled || !r.intersects(getSystemBackupRanges())) {
+					restoreRange.push_back_deep(restoreRange.arena(), r);
+				} else {
+					KeyRangeRef normalKeyRange = r & normalKeys;
+					KeyRangeRef systemKeyRange = r & systemKeys;
+					if (!normalKeyRange.empty()) {
+						restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange);
+					}
+					if (!systemKeyRange.empty()) {
+						systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
+					}
+				}
+			}
+			if (!systemRestoreRange.empty()) {
+				// restore system keys
+				wait(success(restore(backupAgent,
+				                     cx,
+				                     cx,
+				                     "system_restore"_sr,
+				                     KeyRef(bc->getURL()),
+				                     bc->getProxy(),
+				                     systemRestoreRange,
+				                     WaitForComplete::True,
+				                     ::invalidVersion,
+				                     Verbose::True,
+				                     addPrefix,
+				                     removePrefix,
+				                     LockDB::True,
+				                     UnlockDB::False,
+				                     OnlyApplyMutationLogs::False,
+				                     InconsistentSnapshotOnly::False,
+				                     ::invalidVersion,
+				                     {},
+				                     randomUid)));
+				state Reference<ReadYourWritesTransaction> rywTransaction =
+				    Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));
+				// clear old restore config associated with system keys
+				loop {
+					try {
+						rywTransaction->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+						rywTransaction->setOption(FDBTransactionOptions::LOCK_AWARE);
+						state RestoreConfig oldRestore(randomUid);
+						oldRestore.clear(rywTransaction);
+						wait(rywTransaction->commit());
+						break;
+					} catch (Error& e) {
+						wait(rywTransaction->onError(e));
+					}
+				}
+			}
+			// restore user data
+			state Version ver = wait(restore(backupAgent,
+			                                 cx,
+			                                 cx,
+			                                 tagName,
+			                                 KeyRef(bc->getURL()),
+			                                 bc->getProxy(),
+			                                 restoreRange,
+			                                 WaitForComplete::True,
+			                                 ::invalidVersion,
+			                                 Verbose::True,
+			                                 addPrefix,
+			                                 removePrefix,
+			                                 LockDB::True,
+			                                 UnlockDB::True,
+			                                 OnlyApplyMutationLogs::False,
+			                                 InconsistentSnapshotOnly::False,
+			                                 ::invalidVersion,
+			                                 {},
+			                                 randomUid));
 			return ver;
 		}
 	}
@ -6120,6 +6223,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
                                         Key addPrefix,
                                         Key removePrefix,
                                         LockDB lockDB,
+                                         UnlockDB unlockDB,
                                         OnlyApplyMutationLogs onlyApplyMutationLogs,
                                         InconsistentSnapshotOnly inconsistentSnapshotOnly,
                                         Version beginVersion,
@ -6137,6 +6241,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
 	                                    addPrefix,
 	                                    removePrefix,
 	                                    lockDB,
+	                                    unlockDB,
 	                                    onlyApplyMutationLogs,
 	                                    inconsistentSnapshotOnly,
 	                                    beginVersion,
@ -6178,6 +6283,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
 	               addPrefix,
 	               removePrefix,
 	               lockDB,
+	               UnlockDB::True,
 	               onlyApplyMutationLogs,
 	               inconsistentSnapshotOnly,
 	               beginVersion,
--- a/fdbclient/ManagementAPI.actor.cpp
+++ b/fdbclient/ManagementAPI.actor.cpp
@ -2559,19 +2559,19 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
 	}
 }

-void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) {
+void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) {
 	tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
 	auto key = storageQuotaKey(tenantName);
-	tr.set(key, BinaryWriter::toValue<uint64_t>(quota, Unversioned()));
+	tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned()));
 }

-ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
+ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
 	tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
 	state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName)));
 	if (!v.present()) {
-		return Optional<uint64_t>();
+		return Optional<int64_t>();
 	}
-	return BinaryReader::fromStringRef<uint64_t>(v.get(), Unversioned());
+	return BinaryReader::fromStringRef<int64_t>(v.get(), Unversioned());
 }

 std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) {
--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@ -1479,16 +1479,6 @@ Future<RangeResult> HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction*
 	return healthMetricsGetRangeActor(ryw, kr);
 }

-KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) {
-	if (prefix.empty()) {
-		return range;
-	} else {
-		KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin;
-		KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end;
-		return KeyRangeRef(begin, end);
-	}
-}
-
 ACTOR Future<UID> getClusterId(Database db) {
 	while (!db->clientInfo->get().clusterId.isValid()) {
 		wait(db->clientInfo->onChange());
@ -1925,7 +1915,8 @@ Optional<KeyRangeLocationInfo> DatabaseContext::getCachedLocation(const Optional
 	auto range =
 	    isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey);
 	if (range->value()) {
-		return KeyRangeLocationInfo(tenantEntry, toRelativeRange(range->range(), tenantEntry.prefix), range->value());
+		return KeyRangeLocationInfo(
+		    tenantEntry, toPrefixRelativeRange(range->range(), tenantEntry.prefix), range->value());
 	}

 	return Optional<KeyRangeLocationInfo>();
@ -1962,7 +1953,8 @@ bool DatabaseContext::getCachedLocations(const Optional<TenantNameRef>& tenantNa
 			result.clear();
 			return false;
 		}
-		result.emplace_back(tenantEntry, toRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value());
+		result.emplace_back(
+		    tenantEntry, toPrefixRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value());
 		if (result.size() == limit || begin == end) {
 			break;
 		}
@ -2978,7 +2970,7 @@ ACTOR Future<KeyRangeLocationInfo> getKeyLocation_internal(Database cx,

 					return KeyRangeLocationInfo(
 					    rep.tenantEntry,
-					    KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
+					    KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
 					    locationInfo);
 				}
 			}
@ -3123,7 +3115,7 @@ ACTOR Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations_internal(
 						// efficient to save the map pairs and insert them all at once.
 						results.emplace_back(
 						    rep.tenantEntry,
-						    (toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
+						    (toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
 						    cx->setCachedLocation(
 						        tenant.name, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second));
 						wait(yield());
@ -6558,7 +6550,7 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
 			    e.code() != error_code_grv_proxy_memory_limit_exceeded &&
 			    e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled &&
 			    e.code() != error_code_process_behind && e.code() != error_code_future_version &&
-			    e.code() != error_code_tenant_not_found) {
+			    e.code() != error_code_tenant_not_found && e.code() != error_code_proxy_tag_throttled) {
 				TraceEvent(SevError, "TryCommitError").error(e);
 			}
 			if (trState->trLogInfo)
@ -6999,6 +6991,8 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
 				                               &GrvProxyInterface::getConsistentReadVersion,
 				                               req,
 				                               cx->taskID))) {
+					CODE_PROBE(v.proxyTagThrottledDuration > 0.0,
+					           "getConsistentReadVersion received GetReadVersionReply delayed by proxy tag throttling");
 					if (tags.size() != 0) {
 						auto& priorityThrottledTags = cx->throttledTags[priority];
 						for (auto& tag : tags) {
@ -7033,7 +7027,7 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
 			}
 		} catch (Error& e) {
 			if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled &&
-			    e.code() != error_code_grv_proxy_memory_limit_exceeded)
+			    e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_proxy_tag_throttled)
 				TraceEvent(SevError, "GetConsistentReadVersionError").error(e);
 			if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) {
 				wait(delayJittered(5.0));
@ -7484,7 +7478,7 @@ Future<Void> Transaction::onError(Error const& e) {
 	    e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded ||
 	    e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind ||
 	    e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled ||
-	    e.code() == error_code_blob_granule_request_failed) {
+	    e.code() == error_code_blob_granule_request_failed || e.code() == error_code_proxy_tag_throttled) {
 		if (e.code() == error_code_not_committed)
 			++trState->cx->transactionsNotCommitted;
 		else if (e.code() == error_code_commit_unknown_result)
@ -7724,6 +7718,35 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
 	}
 }

+ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
+                                                                      KeyRange keys,
+                                                                      std::vector<KeyRangeLocationInfo> locations,
+                                                                      StorageMetrics min,
+                                                                      StorageMetrics max,
+                                                                      StorageMetrics permittedError) {
+	try {
+		Future<StorageMetrics> fx;
+		if (locations.size() > 1) {
+			fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
+		} else {
+			WaitMetricsRequest req(tenantInfo, keys, min, max);
+			fx = loadBalance(locations[0].locations->locations(),
+			                 &StorageServerInterface::waitMetrics,
+			                 req,
+			                 TaskPriority::DataDistribution);
+		}
+		StorageMetrics x = wait(fx);
+		return x;
+	} catch (Error& e) {
+		TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
+		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
+			TraceEvent(SevError, "WaitStorageMetricsError").error(e);
+			throw;
+		}
+	}
+	return Optional<StorageMetrics>();
+}
+
 ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
    Database cx,
    KeyRange keys,
@ -7753,38 +7776,26 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
 		}

 		// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
-		// solution to this.
-		if (locations.size() < shardLimit) {
-			try {
-				Future<StorageMetrics> fx;
-				if (locations.size() > 1) {
-					fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
-				} else {
-					WaitMetricsRequest req(tenantInfo, keys, min, max);
-					fx = loadBalance(locations[0].locations->locations(),
-					                 &StorageServerInterface::waitMetrics,
-					                 req,
-					                 TaskPriority::DataDistribution);
-				}
-				StorageMetrics x = wait(fx);
-				return std::make_pair(x, -1);
-			} catch (Error& e) {
-				if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
-					TraceEvent(SevError, "WaitStorageMetricsError").error(e);
-					throw;
-				}
-				cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
-				wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
-			}
-		} else {
+		// solution to this. How could this happen?
+		if (locations.size() >= shardLimit) {
 			TraceEvent(SevWarn, "WaitStorageMetricsPenalty")
 			    .detail("Keys", keys)
-			    .detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT)
+			    .detail("Limit", shardLimit)
+			    .detail("LocationSize", locations.size())
 			    .detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY);
 			wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
 			// make sure that the next getKeyRangeLocations() call will actually re-fetch the range
 			cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
+			continue;
 		}
+
+		Optional<StorageMetrics> res =
+		    wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
+		if (res.present()) {
+			return std::make_pair(res, -1);
+		}
+		cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
+		wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
 	}
 }

@ -8645,6 +8656,56 @@ Future<Void> DatabaseContext::splitStorageMetricsStream(const PromiseStream<Key>
 	    resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
 }

+ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
+    std::vector<KeyRangeLocationInfo> locations,
+    KeyRange keys,
+    StorageMetrics limit,
+    StorageMetrics estimated,
+    Optional<int> minSplitBytes) {
+	state StorageMetrics used;
+	state Standalone<VectorRef<KeyRef>> results;
+	results.push_back_deep(results.arena(), keys.begin);
+	//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
+	try {
+		state int i = 0;
+		for (; i < locations.size(); i++) {
+			SplitMetricsRequest req(
+			    locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
+			SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
+			                                         &StorageServerInterface::splitMetrics,
+			                                         req,
+			                                         TaskPriority::DataDistribution));
+			if (res.splits.size() && res.splits[0] <= results.back()) { // split points are out of order, possibly
+				                                                        // because of moving data, throw error to retry
+				ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
+				throw all_alternatives_failed();
+			}
+			if (res.splits.size()) {
+				results.append(results.arena(), res.splits.begin(), res.splits.size());
+				results.arena().dependsOn(res.splits.arena());
+			}
+			used = res.used;
+
+			//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
+		}
+
+		if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) {
+			results.resize(results.arena(), results.size() - 1);
+		}
+
+		if (keys.end <= locations.back().range.end) {
+			results.push_back_deep(results.arena(), keys.end);
+		}
+		return results;
+	} catch (Error& e) {
+		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
+			TraceEvent(SevError, "SplitStorageMetricsError").error(e);
+			throw;
+		}
+	}
+	return Optional<Standalone<VectorRef<KeyRef>>>();
+}
+
 ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
                                                                KeyRange keys,
                                                                StorageMetrics limit,
@ -8663,61 +8724,24 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
 		                              Optional<UID>(),
 		                              UseProvisionalProxies::False,
 		                              latestVersion));
-		state StorageMetrics used;
-		state Standalone<VectorRef<KeyRef>> results;

 		// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
 		// solution to this.
 		if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
 			wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
 			cx->invalidateCache(Key(), keys);
-		} else {
-			results.push_back_deep(results.arena(), keys.begin);
-			try {
-				//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
-
-				state int i = 0;
-				for (; i < locations.size(); i++) {
-					SplitMetricsRequest req(
-					    locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
-					SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
-					                                         &StorageServerInterface::splitMetrics,
-					                                         req,
-					                                         TaskPriority::DataDistribution));
-					if (res.splits.size() &&
-					    res.splits[0] <= results.back()) { // split points are out of order, possibly because of
-						                                   // moving data, throw error to retry
-						ASSERT_WE_THINK(
-						    false); // FIXME: This seems impossible and doesn't seem to be covered by testing
-						throw all_alternatives_failed();
-					}
-					if (res.splits.size()) {
-						results.append(results.arena(), res.splits.begin(), res.splits.size());
-						results.arena().dependsOn(res.splits.arena());
-					}
-					used = res.used;
-
-					//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
-				}
-
-				if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) &&
-				    results.size() > 1) {
-					results.resize(results.arena(), results.size() - 1);
-				}
-
-				if (keys.end <= locations.back().range.end) {
-					results.push_back_deep(results.arena(), keys.end);
-				}
-				return results;
-			} catch (Error& e) {
-				if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
-					TraceEvent(SevError, "SplitStorageMetricsError").error(e);
-					throw;
-				}
-				cx->invalidateCache(Key(), keys);
-				wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
-			}
+			continue;
 		}
+
+		Optional<Standalone<VectorRef<KeyRef>>> results =
+		    wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));
+
+		if (results.present()) {
+			return results.get();
+		}
+
+		cx->invalidateCache(Key(), keys);
+		wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
 	}
 }

@ -10540,6 +10564,76 @@ Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
 }

 // BlobGranule API.
+ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) {
+	state Standalone<VectorRef<KeyRangeRef>> blobRanges;
+	state Key beginKey = range.begin;
+
+	loop {
+		try {
+			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+
+			state RangeResult results = wait(
+			    krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
+
+			blobRanges.arena().dependsOn(results.arena());
+			for (int i = 0; i < results.size() - 1; i++) {
+				if (results[i].value == blobRangeActive) {
+					blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
+				}
+				if (blobRanges.size() == batchLimit) {
+					return blobRanges;
+				}
+			}
+
+			if (!results.more) {
+				return blobRanges;
+			}
+			beginKey = results.back().key;
+		} catch (Error& e) {
+			wait(tr->onError(e));
+		}
+	}
+}
+
+ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobbifiedRanges(Transaction* tr,
+                                                                     KeyRange range,
+                                                                     int rangeLimit,
+                                                                     Optional<TenantName> tenantName) {
+	state TenantMapEntry tme;
+
+	loop {
+		try {
+			if (tenantName.present()) {
+				wait(store(tme, blobGranuleGetTenantEntry(tr, range.begin, tenantName)));
+				range = range.withPrefix(tme.prefix);
+			}
+			break;
+		} catch (Error& e) {
+			wait(tr->onError(e));
+		}
+	}
+
+	state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
+	if (!tenantName.present()) {
+		return blobRanges;
+	}
+
+	// Strip tenant prefix out.
+	state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
+	for (auto& blobRange : blobRanges) {
+		// Filter out blob ranges that span tenants for some reason.
+		if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
+			TraceEvent("ListBlobbifiedRangeSpansTenants")
+			    .suppressFor(/*seconds=*/5)
+			    .detail("Tenant", tenantName.get())
+			    .detail("Range", blobRange);
+			continue;
+		}
+		tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
+	}
+	return tenantBlobRanges;
+}
+
 ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
                                         KeyRange range,
                                         Version purgeVersion,
@ -10582,10 +10676,13 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
 			}

 			// must be aligned to blob range(s)
-			state Future<Optional<Value>> beginPresent = tr.get(purgeRange.begin.withPrefix(blobRangeKeys.begin));
-			state Future<Optional<Value>> endPresent = tr.get(purgeRange.end.withPrefix(blobRangeKeys.begin));
-			wait(success(beginPresent) && success(endPresent));
-			if (!beginPresent.get().present() || !endPresent.get().present()) {
+			state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
+			    getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2, {});
+			state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
+			    getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2, {});
+			wait(success(blobbifiedBegin) && success(blobbifiedEnd));
+			if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
+			    (!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) {
 				TraceEvent("UnalignedPurge")
 				    .detail("Range", range)
 				    .detail("Version", purgeVersion)
@ -10662,39 +10759,6 @@ Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
 	return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
 }

-ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Reference<ReadYourWritesTransaction> tr,
-                                                               KeyRange range,
-                                                               int batchLimit) {
-	state Standalone<VectorRef<KeyRangeRef>> blobRanges;
-	state Key beginKey = range.begin;
-
-	loop {
-		try {
-			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
-
-			state RangeResult results = wait(
-			    krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
-
-			blobRanges.arena().dependsOn(results.arena());
-			for (int i = 0; i < results.size() - 1; i++) {
-				if (results[i].value == blobRangeActive) {
-					blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
-				}
-				if (blobRanges.size() == batchLimit) {
-					return blobRanges;
-				}
-			}
-
-			if (!results.more) {
-				return blobRanges;
-			}
-			beginKey = results.back().key;
-		} catch (Error& e) {
-			wait(tr->onError(e));
-		}
-	}
-}
-
 ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
                                     KeyRange range,
                                     bool active,
@ -10716,7 +10780,7 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
 				range = range.withPrefix(tenantEntry.prefix);
 			}

-			Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(tr, range, 1));
+			Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1));

 			if (active) {
 				// Idempotent request.
@ -10764,47 +10828,19 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Refer
                                                                           KeyRange range,
                                                                           int rangeLimit,
                                                                           Optional<TenantName> tenantName) {
+
 	state Database db(cx);
-	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
-	state TenantMapEntry tme;
+	state Transaction tr(db);

-	loop {
-		try {
-			if (tenantName.present()) {
-				wait(store(tme, blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName)));
-				range = range.withPrefix(tme.prefix);
-			}
-			break;
-		} catch (Error& e) {
-			wait(tr->onError(e));
-		}
-	}
+	Standalone<VectorRef<KeyRangeRef>> blobbifiedRanges = wait(getBlobbifiedRanges(&tr, range, rangeLimit, tenantName));

-	state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
-	if (!tenantName.present()) {
-		return blobRanges;
-	}
-
-	// Strip tenant prefix out.
-	state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
-	for (auto& blobRange : blobRanges) {
-		// Filter out blob ranges that span tenants for some reason.
-		if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
-			TraceEvent("ListBlobbifiedRangeSpansTenants")
-			    .suppressFor(/*seconds=*/5)
-			    .detail("Tenant", tenantName.get())
-			    .detail("Range", blobRange);
-			continue;
-		}
-		tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
-	}
-	return tenantBlobRanges;
+	return blobbifiedRanges;
 }

 Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,
-                                                                                 int rowLimit,
+                                                                                 int rangeLimit,
                                                                                 Optional<TenantName> tenantName) {
-	return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rowLimit, tenantName);
+	return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName);
 }

 int64_t getMaxKeySize(KeyRef const& key) {
--- a/fdbclient/ServerKnobs.cpp
+++ b/fdbclient/ServerKnobs.cpp
@ -297,7 +297,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
 	init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC,   isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
 	init( DD_TENANT_AWARENESS_ENABLED,                         false );
 	init( TENANT_CACHE_LIST_REFRESH_INTERVAL,                      2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
-	init( TENANT_CACHE_STORAGE_REFRESH_INTERVAL,                   2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
+	init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL,             2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
+	init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL,            10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);

 	// TeamRemover
 	init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER,                false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -726,8 +727,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
 	init( TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL,                30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0;
 	init( AUTO_TAG_THROTTLING_ENABLED,                          true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false;
 	init( SS_THROTTLE_TAGS_TRACKED,                                1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10);
-	init( GLOBAL_TAG_THROTTLING,                               false );
-	init( ENFORCE_TAG_THROTTLING_ON_PROXIES,                   false );
+	init( GLOBAL_TAG_THROTTLING,                               false ); if(isSimulated) GLOBAL_TAG_THROTTLING = deterministicRandom()->coinflip();
+	init( ENFORCE_TAG_THROTTLING_ON_PROXIES,   GLOBAL_TAG_THROTTLING );
 	init( GLOBAL_TAG_THROTTLING_MIN_RATE,                        1.0 );
 	init( GLOBAL_TAG_THROTTLING_FOLDING_TIME,                   10.0 );
 	init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO,            5.0 );
@ -966,6 +967,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
 	init( BG_CONSISTENCY_CHECK_ENABLED,                         true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false;
 	init( BG_CONSISTENCY_CHECK_TARGET_SPEED_KB,                 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10);
 	init( BG_KEY_TUPLE_TRUNCATE_OFFSET,                            0 );
+	init( BG_ENABLE_READ_DRIVEN_COMPACTION,                     true ); if (randomize && BUGGIFY) BG_ENABLE_READ_DRIVEN_COMPACTION = false;
+	init( BG_RDC_BYTES_FACTOR,                                     2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10);
+	init( BG_RDC_READ_FACTOR,                                      3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10);

 	init( BG_ENABLE_MERGING,                                    true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
 	init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
@ -974,6 +978,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
 	init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM,                8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
 	init( BLOB_WORKER_RESNAPSHOT_PARALLELISM,                     40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10);
 	init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM,             2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100);
+	init( BLOB_WORKER_RDC_PARALLELISM,                             2 ); if( randomize && BUGGIFY ) BLOB_WORKER_RDC_PARALLELISM = deterministicRandom()->randomInt(1, 6);
+
 	init( BLOB_WORKER_TIMEOUT,                                  10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
 	init( BLOB_WORKER_REQUEST_TIMEOUT,                           5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0;
 	init( BLOB_WORKERLIST_FETCH_INTERVAL,                        1.0 );
--- a/fdbclient/TaskBucket.actor.cpp
+++ b/fdbclient/TaskBucket.actor.cpp
@ -579,8 +579,8 @@ public:
 	                              int maxConcurrentTasks) {
 		state Reference<AsyncVar<bool>> paused = makeReference<AsyncVar<bool>>(true);
 		state Future<Void> watchPausedFuture = watchPaused(cx, taskBucket, paused);
-		taskBucket->metricLogger = traceCounters(
-		    "TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY, &taskBucket->cc);
+		taskBucket->metricLogger = taskBucket->cc.traceCounters(
+		    "TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY);
 		loop {
 			while (paused->get()) {
 				wait(paused->onChange() || watchPausedFuture);
--- a/fdbclient/include/fdbclient/BackupAgent.actor.h
+++ b/fdbclient/include/fdbclient/BackupAgent.actor.h
@ -196,6 +196,7 @@ public:
 	                        Key addPrefix = Key(),
 	                        Key removePrefix = Key(),
 	                        LockDB = LockDB::True,
+	                        UnlockDB = UnlockDB::True,
 	                        OnlyApplyMutationLogs = OnlyApplyMutationLogs::False,
 	                        InconsistentSnapshotOnly = InconsistentSnapshotOnly::False,
 	                        Version beginVersion = ::invalidVersion,
--- a/fdbclient/include/fdbclient/BlobWorkerCommon.h
+++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h
@ -45,6 +45,7 @@ struct BlobWorkerStats {
 	Counter compressionBytesFinal;
 	Counter fullRejections;
 	Counter forceFlushCleanups;
+	Counter readDrivenCompactions;

 	int numRangesAssigned;
 	int mutationBytesBuffered;
@ -83,10 +84,11 @@ struct BlobWorkerStats {
 	    readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
 	    flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
 	    compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc),
-	    forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0),
-	    activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0),
-	    notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0),
-	    initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) {
+	    forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc),
+	    numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0),
+	    minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0),
+	    estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock),
+	    deltaWritesLock(deltaWritesLock) {
 		specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
 		specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
 		specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });
@ -103,8 +105,8 @@ struct BlobWorkerStats {
 		specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); });
 		specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); });

-		logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics");
+		logger = cc.traceCounters("BlobWorkerMetrics", id, interval, "BlobWorkerMetrics");
 	}
 };

-#endif
+#endif
--- a/fdbclient/include/fdbclient/ClientKnobs.h
+++ b/fdbclient/include/fdbclient/ClientKnobs.h
@ -262,6 +262,8 @@ public:
 	double TAG_THROTTLE_EXPIRATION_INTERVAL;
 	int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations
 	int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations
+	double PROXY_MAX_TAG_THROTTLE_DURATION; // Maximum duration that a transaction can be tag throttled by proxy before
+	                                        // being rejected

 	// busyness reporting
 	double BUSYNESS_SPIKE_START_THRESHOLD;
--- a/fdbclient/include/fdbclient/FDBTypes.h
+++ b/fdbclient/include/fdbclient/FDBTypes.h
@ -336,12 +336,13 @@ struct KeyRangeRef {
 	bool isCovered(std::vector<KeyRangeRef>& ranges) {
 		ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder()));
 		KeyRangeRef clone(begin, end);
+
 		for (auto r : ranges) {
-			if (begin < r.begin)
+			if (clone.begin < r.begin)
 				return false; // uncovered gap between clone.begin and r.begin
-			if (end <= r.end)
+			if (clone.end <= r.end)
 				return true; // range is fully covered
-			if (end > r.begin)
+			if (clone.end > r.begin)
 				// {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end}
 				clone = KeyRangeRef(r.end, clone.end);
 		}
@ -589,6 +590,8 @@ inline KeyRange prefixRange(KeyRef prefix) {
 // The returned reference is valid as long as keys is valid.
 KeyRef keyBetween(const KeyRangeRef& keys);

+KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix);
+
 struct KeySelectorRef {
 private:
 	KeyRef key; // Find the last item less than key
--- a/fdbclient/include/fdbclient/KeyLocationService.h
+++ b/fdbclient/include/fdbclient/KeyLocationService.h
@ -0,0 +1,48 @@
+/*
+ * KeyLocationService.h
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef FOUNDATIONDB_KEYLOCATIONSERVICE_H
+#define FOUNDATIONDB_KEYLOCATIONSERVICE_H
+
+#include "fdbclient/NativeAPI.actor.h"
+#include "fdbclient/DatabaseContext.h"
+
+class IKeyLocationService {
+
+	// If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key).
+	// Otherwise returns the shard containing key. It's possible the returned location is a failed interface.
+	virtual Future<KeyRangeLocationInfo> getKeyLocation(TenantInfo tenant,
+	                                                    Key key,
+	                                                    SpanContext spanContext,
+	                                                    Optional<UID> debugID,
+	                                                    UseProvisionalProxies useProvisionalProxies,
+	                                                    Reverse isBackward,
+	                                                    Version version) = 0;
+
+	virtual Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations(TenantInfo tenant,
+	                                                                       KeyRange keys,
+	                                                                       int limit,
+	                                                                       Reverse reverse,
+	                                                                       SpanContext spanContext,
+	                                                                       Optional<UID> debugID,
+	                                                                       UseProvisionalProxies useProvisionalProxies,
+	                                                                       Version version) = 0;
+};
+
+#endif // FOUNDATIONDB_KEYLOCATIONSERVICE_H
--- a/fdbclient/include/fdbclient/ManagementAPI.actor.h
+++ b/fdbclient/include/fdbclient/ManagementAPI.actor.h
@ -164,8 +164,8 @@ bool schemaMatch(json_spirit::mValue const& schema,
 ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);

 // Set and get the storage quota per tenant
-void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota);
-ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
+void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota);
+ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);

 #include "flow/unactorcompiler.h"
 #endif
--- a/fdbclient/include/fdbclient/NativeAPI.actor.h
+++ b/fdbclient/include/fdbclient/NativeAPI.actor.h
@ -591,6 +591,26 @@ int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess);
 // Returns the maximum legal size of a key that can be cleared. Keys larger than this will be assumed not to exist.
 int64_t getMaxClearKeySize(KeyRef const& key);

+struct KeyRangeLocationInfo;
+// Return the aggregated StorageMetrics of range keys to the caller. The locations tell which interface should
+// serve the request. The final result is within (min-permittedError/2, max + permittedError/2) if valid.
+ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
+                                                                      KeyRange keys,
+                                                                      std::vector<KeyRangeLocationInfo> locations,
+                                                                      StorageMetrics min,
+                                                                      StorageMetrics max,
+                                                                      StorageMetrics permittedError);
+
+// Return the suggested split points from storage server.The locations tell which interface should
+// serve the request. `limit` is the current estimated storage metrics of `keys`.The returned points, if present,
+// guarantee the metrics of split result is within limit.
+ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
+    std::vector<KeyRangeLocationInfo> locations,
+    KeyRange keys,
+    StorageMetrics limit,
+    StorageMetrics estimated,
+    Optional<int> minSplitBytes);
+
 namespace NativeAPI {
 ACTOR Future<std::vector<std::pair<StorageServerInterface, ProcessClass>>> getServerListAndProcessClasses(
    Transaction* tr);
--- a/fdbclient/include/fdbclient/ServerKnobs.h
+++ b/fdbclient/include/fdbclient/ServerKnobs.h
@ -237,8 +237,10 @@ public:
 	    DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
 	bool DD_TENANT_AWARENESS_ENABLED;
 	int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
-	int TENANT_CACHE_STORAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant in the TenantCache is
-	                                           // refreshed
+	int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed
+	                                                 // in the TenantCache
+	int TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL; // How often the storage quota allocated to each tenant is
+	                                                 // refreshed in the TenantCache

 	// TeamRemover to remove redundant teams
 	bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -948,10 +950,14 @@ public:
 	int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS;
 	int BG_MERGE_CANDIDATE_DELAY_SECONDS;
 	int BG_KEY_TUPLE_TRUNCATE_OFFSET;
+	bool BG_ENABLE_READ_DRIVEN_COMPACTION;
+	int BG_RDC_BYTES_FACTOR;
+	int BG_RDC_READ_FACTOR;

 	int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
 	int BLOB_WORKER_RESNAPSHOT_PARALLELISM;
 	int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM;
+	int BLOB_WORKER_RDC_PARALLELISM;

 	double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
 	double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout
--- a/fdbclient/include/fdbclient/TenantEntryCache.actor.h
+++ b/fdbclient/include/fdbclient/TenantEntryCache.actor.h
@ -68,6 +68,10 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con
 // 1. Lookup by 'TenantId'
 // 2. Lookup by 'TenantPrefix'
 // 3. Lookup by 'TenantName'
+// TODO: Currently this cache performs poorly if there are tenant access happening to unknown tenants which happens most
+// frequently in optional tenant mode but can also happen in required mode if there are alot of tenants created. Further
+// as a consequence of the design we cannot be sure that the state of a given tenant is accurate even if its present in
+// the cache.

 template <class T>
 class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable {
--- a/fdbclient/include/fdbclient/Tracing.h
+++ b/fdbclient/include/fdbclient/Tracing.h
@ -273,17 +273,4 @@ struct ITracer {
 	virtual void trace(Span const& span) = 0;
 };

-void openTracer(TracerType type);
-
-template <class T>
-struct SpannedDeque : Deque<T> {
-	Span span;
-	explicit SpannedDeque(Location loc) : span(loc) {}
-	SpannedDeque(SpannedDeque&& other) : Deque<T>(std::move(other)), span(std::move(other.span)) {}
-	SpannedDeque(SpannedDeque const&) = delete;
-	SpannedDeque& operator=(SpannedDeque const&) = delete;
-	SpannedDeque& operator=(SpannedDeque&& other) {
-		*static_cast<Deque<T>*>(this) = std::move(other);
-		span = std::move(other.span);
-	}
-};
+void openTracer(TracerType type);
--- a/fdbmonitor/CMakeLists.txt
+++ b/fdbmonitor/CMakeLists.txt
@ -5,9 +5,8 @@ get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
 target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
 target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
 strip_debug_symbols(fdbmonitor)
-assert_no_version_h(fdbmonitor)
 if(UNIX AND NOT APPLE)
-    target_link_libraries(fdbmonitor PRIVATE rt)
+  target_link_libraries(fdbmonitor PRIVATE rt)
 endif()
 # FIXME: This include directory is an ugly hack. We probably want to fix this.
 # as soon as we get rid of the old build system
@ -17,17 +16,17 @@ target_link_libraries(fdbmonitor PUBLIC Threads::Threads)
 # appears to change its behavior (it no longer seems to restart killed
 # processes). fdbmonitor is single-threaded anyway.
 get_target_property(fdbmonitor_options fdbmonitor COMPILE_OPTIONS)
-if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
+if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
  list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
  set_property(TARGET fdbmonitor PROPERTY COMPILE_OPTIONS ${fdbmonitor_options})
-endif ()
+endif()

 get_target_property(fdbmonitor_options fdbmonitor LINK_OPTIONS)

-if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
+if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
  list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
  set_property(TARGET fdbmonitor PROPERTY LINK_OPTIONS ${fdbmonitor_options})
-endif ()
+endif()

 if(GENERATE_DEBUG_PACKAGES)
  fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server)
@ -51,7 +50,7 @@ add_custom_target(clean_sandbox

 add_custom_target(start_sandbox
  COMMAND ${CMAKE_BINARY_DIR}/bin/fdbmonitor --conffile ${CMAKE_BINARY_DIR}/sandbox/foundationdb.conf
-                                             --lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
+  --lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)

 add_dependencies(start_sandbox fdbmonitor fdbserver)

@ -61,6 +60,6 @@ if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
 endif()

 add_custom_target(generate_profile
-  COMMAND  ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
+  COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})

 add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)
--- a/fdbrpc/Stats.actor.cpp
+++ b/fdbrpc/Stats.actor.cpp
@ -24,8 +24,8 @@
 Counter::Counter(std::string const& name, CounterCollection& collection)
  : name(name), interval_start(0), last_event(0), interval_sq_time(0), roughness_interval_start(0), interval_delta(0),
    interval_start_value(0) {
-	metric.init(collection.name + "." + (char)toupper(name.at(0)) + name.substr(1), collection.id);
-	collection.counters.push_back(this);
+	metric.init(collection.getName() + "." + (char)toupper(name.at(0)) + name.substr(1), collection.getId());
+	collection.addCounter(this);
 }

 void Counter::operator+=(Value delta) {
@ -88,36 +88,48 @@ void CounterCollection::logToTraceEvent(TraceEvent& te) const {
 	}
 }

-ACTOR Future<Void> traceCounters(std::string traceEventName,
-                                 UID traceEventID,
-                                 double interval,
-                                 CounterCollection* counters,
-                                 std::string trackLatestName,
-                                 std::function<void(TraceEvent&)> decorator) {
-	wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized
+class CounterCollectionImpl {
+public:
+	ACTOR static Future<Void> traceCounters(CounterCollection* counters,
+	                                        std::string traceEventName,
+	                                        UID traceEventID,
+	                                        double interval,
+	                                        std::string trackLatestName,
+	                                        std::function<void(TraceEvent&)> decorator) {
+		wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized

-	for (ICounter* c : counters->counters)
-		c->resetInterval();
-
-	state Reference<EventCacheHolder> traceEventHolder;
-	if (!trackLatestName.empty()) {
-		traceEventHolder = makeReference<EventCacheHolder>(trackLatestName);
-	}
-
-	state double last_interval = now();
-
-	loop {
-		TraceEvent te(traceEventName.c_str(), traceEventID);
-		te.detail("Elapsed", now() - last_interval);
-
-		counters->logToTraceEvent(te);
-		decorator(te);
+		for (ICounter* c : counters->counters)
+			c->resetInterval();

+		state Reference<EventCacheHolder> traceEventHolder;
 		if (!trackLatestName.empty()) {
-			te.trackLatest(traceEventHolder->trackingKey);
+			traceEventHolder = makeReference<EventCacheHolder>(trackLatestName);
 		}

-		last_interval = now();
-		wait(delay(interval, TaskPriority::FlushTrace));
+		state double last_interval = now();
+
+		loop {
+			TraceEvent te(traceEventName.c_str(), traceEventID);
+			te.detail("Elapsed", now() - last_interval);
+
+			counters->logToTraceEvent(te);
+			decorator(te);
+
+			if (!trackLatestName.empty()) {
+				te.trackLatest(traceEventHolder->trackingKey);
+			}
+
+			last_interval = now();
+			wait(delay(interval, TaskPriority::FlushTrace));
+		}
 	}
+};
+
+Future<Void> CounterCollection::traceCounters(std::string const& traceEventName,
+                                              UID traceEventID,
+                                              double interval,
+                                              std::string const& trackLatestName,
+                                              std::function<void(TraceEvent&)> const& decorator) {
+	return CounterCollectionImpl::traceCounters(
+	    this, traceEventName, traceEventID, interval, trackLatestName, decorator);
 }
--- a/fdbrpc/include/fdbrpc/Stats.h
+++ b/fdbrpc/include/fdbrpc/Stats.h
@ -67,17 +67,37 @@ struct Traceable<ICounter*> : std::true_type {
 	}
 };

-struct CounterCollection {
-	CounterCollection(std::string name, std::string id = std::string()) : name(name), id(id) {}
-	std::vector<struct ICounter*> counters, counters_to_remove;
-	~CounterCollection() {
-		for (auto c : counters_to_remove)
-			c->remove();
-	}
+class CounterCollection {
+	friend class CounterCollectionImpl;
+
 	std::string name;
 	std::string id;
+	std::vector<struct ICounter*> counters, countersToRemove;
+
+public:
+	CounterCollection(std::string const& name, std::string const& id = std::string()) : name(name), id(id) {}
+	~CounterCollection() {
+		for (auto c : countersToRemove)
+			c->remove();
+	}
+
+	void addCounter(ICounter* counter) { counters.push_back(counter); }
+
+	// Call remove method on this counter in ~CounterCollection
+	void markForRemoval(ICounter* counter) { countersToRemove.push_back(counter); }
+
+	std::string const& getName() const { return name; }
+
+	std::string const& getId() const { return id; }

 	void logToTraceEvent(TraceEvent& te) const;
+
+	Future<Void> traceCounters(
+	    std::string const& traceEventName,
+	    UID traceEventID,
+	    double interval,
+	    std::string const& trackLatestName = std::string(),
+	    std::function<void(TraceEvent&)> const& decorator = [](auto& te) {});
 };

 struct Counter final : ICounter, NonCopyable {
@ -131,8 +151,8 @@ struct Traceable<Counter> : std::true_type {
 template <class F>
 struct SpecialCounter final : ICounter, FastAllocated<SpecialCounter<F>>, NonCopyable {
 	SpecialCounter(CounterCollection& collection, std::string const& name, F&& f) : name(name), f(f) {
-		collection.counters.push_back(this);
-		collection.counters_to_remove.push_back(this);
+		collection.addCounter(this);
+		collection.markForRemoval(this);
 	}
 	void remove() override { delete this; }

@ -162,14 +182,6 @@ static void specialCounter(CounterCollection& collection, std::string const& nam
 	new SpecialCounter<F>(collection, name, std::move(f));
 }

-Future<Void> traceCounters(
-    std::string const& traceEventName,
-    UID const& traceEventID,
-    double const& interval,
-    CounterCollection* const& counters,
-    std::string const& trackLatestName = std::string(),
-    std::function<void(TraceEvent&)> const& decorator = [](TraceEvent& te) {});
-
 class LatencyBands {
 public:
 	LatencyBands(std::string name, UID id, double loggingInterval)
@ -180,7 +192,7 @@ public:
 			if (bands.size() == 0) {
 				ASSERT(!cc && !filteredCount);
 				cc = std::make_unique<CounterCollection>(name, id.toString());
-				logger = traceCounters(name, id, loggingInterval, cc.get(), id.toString() + "/" + name);
+				logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name);
 				filteredCount = std::make_unique<Counter>("Filtered", *cc);
 				insertBand(std::numeric_limits<double>::infinity());
 			}
--- a/fdbrpc/include/fdbrpc/TenantInfo.h
+++ b/fdbrpc/include/fdbrpc/TenantInfo.h
@ -42,8 +42,6 @@ struct TenantInfo {
 	// Is set during deserialization. It will be set to true if the tenant
 	// name is set and the client is authorized to use this tenant.
 	bool tenantAuthorized = false;
-	// Number of storage bytes currently used by this tenant.
-	int64_t storageUsage = 0;

 	// Helper function for most endpoints that read/write data. This returns true iff
 	// the client is either a) a trusted peer or b) is accessing keyspace belonging to a tenant,
--- a/fdbserver/BackupWorker.actor.cpp
+++ b/fdbserver/BackupWorker.actor.cpp
@ -290,8 +290,8 @@ struct BackupData {
 		specialCounter(cc, "MsgQ", [this]() { return this->messages.size(); });
 		specialCounter(cc, "BufferedBytes", [this]() { return this->lock->activePermits(); });
 		specialCounter(cc, "AvailableBytes", [this]() { return this->lock->available(); });
-		logger = traceCounters(
-		    "BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "BackupWorkerMetrics");
+		logger =
+		    cc.traceCounters("BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "BackupWorkerMetrics");
 	}

 	bool pullFinished() const { return endVersion.present() && pulledVersion.get() > endVersion.get(); }
--- a/fdbserver/BlobManager.actor.cpp
+++ b/fdbserver/BlobManager.actor.cpp
@ -296,7 +296,7 @@ struct BlobManagerStats {
 		specialCounter(cc, "HardBoundaries", [mergeHardBoundaries]() { return mergeHardBoundaries->size(); });
 		specialCounter(cc, "SoftBoundaries", [mergeBoundaries]() { return mergeBoundaries->size(); });
 		specialCounter(cc, "BlockedAssignments", [this]() { return this->blockedAssignments; });
-		logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics");
+		logger = cc.traceCounters("BlobManagerMetrics", id, interval, "BlobManagerMetrics");
 	}
 };

@ -3537,7 +3537,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
 	}

 	// skip the rest of the algorithm for the first blob manager
-	if (bmData->epoch == 1) {
+	if (bmData->epoch == 1 && !isFullRestoreMode()) {
 		bmData->doneRecovering.send(Void());
 		return Void();
 	}
--- a/fdbserver/BlobManifest.actor.cpp
+++ b/fdbserver/BlobManifest.actor.cpp
@ -26,6 +26,7 @@
 #include "fdbclient/BlobGranuleCommon.h"
 #include "fdbserver/Knobs.h"
 #include "flow/FastRef.h"
+#include "flow/Trace.h"
 #include "flow/flow.h"
 #include "fdbclient/NativeAPI.actor.h"
 #include "fdbclient/BlobConnectionProvider.h"
@ -189,23 +190,6 @@ private:
 	static const int sMaxCount_{ 5 }; // max number of manifest file to keep
 };

-// Defines granule info that interests full restore
-struct BlobGranuleVersion {
-	// Two constructors required by VectorRef
-	BlobGranuleVersion() {}
-	BlobGranuleVersion(Arena& a, const BlobGranuleVersion& copyFrom)
-	  : granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version),
-	    sizeInBytes(copyFrom.sizeInBytes) {}
-
-	UID granuleID;
-	KeyRangeRef keyRange;
-	Version version;
-	int64_t sizeInBytes;
-};
-
-// Defines a vector for BlobGranuleVersion
-typedef Standalone<VectorRef<BlobGranuleVersion>> BlobGranuleVersionVector;
-
 // Defines filename, version, size for each granule file that interests full restore
 struct GranuleFileVersion {
 	Version version;
@ -226,16 +210,53 @@ public:
 			Value data = wait(readFromFile(self));
 			Standalone<BlobManifest> manifest = decode(data);
 			wait(writeSystemKeys(self, manifest.rows));
-			BlobGranuleVersionVector _ = wait(listGranules(self));
+			BlobGranuleRestoreVersionVector _ = wait(listGranules(self));
 		} catch (Error& e) {
 			dprint("WARNING: unexpected manifest loader error {}\n", e.what()); // skip error handling so far
 		}
 		return Void();
 	}

+	// Iterate active granules and return their version/sizes
+	ACTOR static Future<BlobGranuleRestoreVersionVector> listGranules(Reference<BlobManifestLoader> self) {
+		state Transaction tr(self->db_);
+		loop {
+			state BlobGranuleRestoreVersionVector results;
+			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+
+			try {
+				std::vector<KeyRangeRef> granules;
+				state int i = 0;
+				auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
+				state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
+				for (i = 0; i < blobRanges.size() - 1; i++) {
+					Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
+					Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
+					state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
+					try {
+						Standalone<BlobGranuleRestoreVersion> granule = wait(getGranule(&tr, granuleRange));
+						results.push_back_deep(results.arena(), granule);
+					} catch (Error& e) {
+						if (e.code() == error_code_restore_missing_data) {
+							dprint("missing data for key range {} \n", granuleRange.toString());
+							TraceEvent("BlobRestoreMissingData").detail("KeyRange", granuleRange.toString());
+						} else {
+							throw;
+						}
+					}
+				}
+				return results;
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+
 	// Print out a summary for blob granules
 	ACTOR static Future<Void> print(Reference<BlobManifestLoader> self) {
-		state BlobGranuleVersionVector granules = wait(listGranules(self));
+		state BlobGranuleRestoreVersionVector granules = wait(listGranules(self));
 		for (auto granule : granules) {
 			wait(checkGranuleFiles(self, granule));
 		}
@ -285,41 +306,9 @@ private:
 		}
 	}

-	// Iterate active granules and return their version/sizes
-	ACTOR static Future<BlobGranuleVersionVector> listGranules(Reference<BlobManifestLoader> self) {
-		state Transaction tr(self->db_);
-		loop {
-			state BlobGranuleVersionVector results;
-			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
-			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
-			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
-
-			try {
-				std::vector<KeyRangeRef> granules;
-				state int i = 0;
-				auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
-				state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
-				for (i = 0; i < blobRanges.size() - 1; i++) {
-					Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
-					Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
-					state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
-					try {
-						Standalone<BlobGranuleVersion> granule = wait(getGranule(&tr, granuleRange));
-						results.push_back_deep(results.arena(), granule);
-					} catch (Error& e) {
-						dprint("missing data for key range {} \n", granuleRange.toString());
-					}
-				}
-				return results;
-			} catch (Error& e) {
-				wait(tr.onError(e));
-			}
-		}
-	}
-
 	// Find the newest granule for a key range. The newest granule has the max version and relevant files
-	ACTOR static Future<Standalone<BlobGranuleVersion>> getGranule(Transaction* tr, KeyRangeRef range) {
-		state Standalone<BlobGranuleVersion> granuleVersion;
+	ACTOR static Future<Standalone<BlobGranuleRestoreVersion>> getGranule(Transaction* tr, KeyRangeRef range) {
+		state Standalone<BlobGranuleRestoreVersion> granuleVersion;
 		KeyRange historyKeyRange = blobGranuleHistoryKeyRangeFor(range);
 		// reverse lookup so that the first row is the newest version
 		state RangeResult results =
@ -389,7 +378,7 @@ private:
 	}

 	// Read data from granules and print out summary
-	ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleVersion granule) {
+	ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleRestoreVersion granule) {
 		state KeyRangeRef range = granule.keyRange;
 		state Version readVersion = granule.version;
 		state Transaction tr(self->db_);
@ -441,3 +430,11 @@ ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProv
 	wait(BlobManifestLoader::print(loader));
 	return Void();
 }
+
+// API to list blob granules
+ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db,
+                                                               Reference<BlobConnectionProvider> blobConn) {
+	Reference<BlobManifestLoader> loader = makeReference<BlobManifestLoader>(db, blobConn);
+	BlobGranuleRestoreVersionVector result = wait(BlobManifestLoader::listGranules(loader));
+	return result;
+}
--- a/fdbserver/BlobMigrator.actor.cpp
+++ b/fdbserver/BlobMigrator.actor.cpp
@ -30,54 +30,312 @@
 #include "fdbclient/KeyRangeMap.h"
 #include "fdbclient/SystemData.h"
 #include "fdbclient/NativeAPI.actor.h"
+#include "fdbclient/ManagementAPI.actor.h"
 #include "fdbserver/ServerDBInfo.actor.h"
 #include "fdbserver/WaitFailure.h"
-
+#include "fdbserver/MoveKeys.actor.h"
+#include "fdbserver/BlobGranuleServerCommon.actor.h"
 #include "flow/actorcompiler.h" // has to be last include
+#include "flow/network.h"
+#include <algorithm>
+#include <string>
+
+#define ENABLE_DEBUG_MG true
+
+template <typename... T>
+static inline void dprint(fmt::format_string<T...> fmt, T&&... args) {
+	if (ENABLE_DEBUG_MG)
+		fmt::print(fmt, std::forward<T>(args)...);
+}

 // BlobMigrator manages data migration from blob storage to storage server. It implements a minimal set of
 // StorageServerInterface APIs which are needed for DataDistributor to start data migration.
 class BlobMigrator : public NonCopyable, public ReferenceCounted<BlobMigrator> {
 public:
 	BlobMigrator(Reference<AsyncVar<ServerDBInfo> const> dbInfo, BlobMigratorInterface interf)
-	  : blobMigratorInterf(interf), actors(false) {
-		if (!blobConn.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
-			blobConn = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
+	  : interf_(interf), actors_(false) {
+		if (!blobConn_.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
+			blobConn_ = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
 		}
-		db = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
+		db_ = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
 	}
 	~BlobMigrator() {}

+	// Start migration
 	ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
-		self->actors.add(waitFailureServer(self->blobMigratorInterf.waitFailure.getFuture()));
+		if (!isFullRestoreMode()) {
+			return Void();
+		}
+		wait(delay(10)); // TODO need to wait for a signal for readiness of blob manager
+
+		BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
+		self->blobGranules_ = granules;
+
+		wait(prepare(self, normalKeys));
+
+		wait(serverLoop(self));
+		return Void();
+	}
+
+private:
+	// Prepare for data migration for given key range.
+	ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
+		// Register as a storage server, so that DataDistributor could start data movement after
+		std::pair<Version, Tag> verAndTag = wait(addStorageServer(self->db_, self->interf_.ssi));
+		dprint("Started storage server interface {} {}\n", verAndTag.first, verAndTag.second.toString());
+
+		// Reassign key ranges to the storage server
+		// It'll restart DataDistributor so that internal data structures like ShardTracker, ShardsAffectedByTeamFailure
+		// could be re-initialized. Ideally it should be done within DataDistributor, then we don't need to
+		// restart DataDistributor
+		state int oldMode = wait(setDDMode(self->db_, 0));
+		wait(unassignServerKeys(self, keys));
+		wait(assignKeysToServer(self, keys, self->interf_.ssi.id()));
+		wait(success(setDDMode(self->db_, oldMode)));
+		return Void();
+	}
+
+	// Assign given key range to specified storage server. Subsquent
+	ACTOR static Future<Void> assignKeysToServer(Reference<BlobMigrator> self, KeyRangeRef keys, UID serverUID) {
+		state Transaction tr(self->db_);
 		loop {
-			choose {
-				when(HaltBlobMigratorRequest req = waitNext(self->blobMigratorInterf.haltBlobMigrator.getFuture())) {
-					req.reply.send(Void());
-					TraceEvent("BlobMigratorHalted", self->blobMigratorInterf.id()).detail("ReqID", req.requesterID);
-					break;
+			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+			try {
+				state Value value = keyServersValue(std::vector<UID>({ serverUID }), std::vector<UID>(), UID(), UID());
+				wait(krmSetRange(&tr, keyServersPrefix, keys, value));
+				wait(krmSetRange(&tr, serverKeysPrefixFor(serverUID), keys, serverKeysTrue));
+				wait(tr.commit());
+				dprint("Assign {} to server {}\n", normalKeys.toString(), serverUID.toString());
+				return Void();
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+
+	// Unassign given key range from its current storage servers
+	ACTOR static Future<Void> unassignServerKeys(Reference<BlobMigrator> self, KeyRangeRef keys) {
+		state Transaction tr(self->db_);
+		loop {
+			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+			try {
+				state RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
+				ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
+				for (auto& server : serverList) {
+					state UID id = decodeServerListValue(server.value).id();
+					RangeResult ranges = wait(krmGetRanges(&tr, serverKeysPrefixFor(id), keys));
+					bool owning = false;
+					for (auto& r : ranges) {
+						if (r.value == serverKeysTrue) {
+							owning = true;
+							break;
+						}
+					}
+					if (owning) {
+						dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
+						wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
+					}
 				}
-				when(wait(self->actors.getResult())) {}
+				wait(tr.commit());
+				return Void();
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+
+	// Main server loop
+	ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
+		self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
+		self->actors_.add(handleRequest(self));
+		self->actors_.add(handleUnsupportedRequest(self));
+		loop {
+			try {
+				choose {
+					when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
+						req.reply.send(Void());
+						TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
+						break;
+					}
+					when(wait(self->actors_.getResult())) {}
+				}
+			} catch (Error& e) {
+				dprint("Unexpected serverLoop error {}\n", e.what());
+				throw;
 			}
 		}
 		return Void();
 	}

+	// Handle StorageServerInterface APIs
+	ACTOR static Future<Void> handleRequest(Reference<BlobMigrator> self) {
+		state StorageServerInterface ssi = self->interf_.ssi;
+		loop {
+			try {
+				choose {
+					when(GetShardStateRequest req = waitNext(ssi.getShardState.getFuture())) {
+						dprint("Handle GetShardStateRequest\n");
+						Version version = maxVersion(self);
+						GetShardStateReply rep(version, version);
+						req.reply.send(rep); // return empty shards
+					}
+					when(WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
+						// dprint("Handle WaitMetricsRequest\n");
+						self->actors_.add(processWaitMetricsRequest(self, req));
+					}
+					when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
+						dprint("Handle SplitMetrics {}\n", req.keys.toString());
+						SplitMetricsReply rep;
+						for (auto granule : self->blobGranules_) {
+							// TODO: Use granule boundary as split point. A better approach is to split by size
+							if (granule.keyRange.begin > req.keys.begin && granule.keyRange.end < req.keys.end)
+								rep.splits.push_back_deep(rep.splits.arena(), granule.keyRange.begin);
+						}
+						req.reply.send(rep);
+					}
+					when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
+						fmt::print("Handle GetStorageMetrics\n");
+						StorageMetrics metrics;
+						metrics.bytes = sizeInBytes(self);
+						GetStorageMetricsReply resp;
+						resp.load = metrics;
+						req.reply.send(resp);
+					}
+					when(ReplyPromise<KeyValueStoreType> reply = waitNext(ssi.getKeyValueStoreType.getFuture())) {
+						dprint("Handle KeyValueStoreType\n");
+						reply.send(KeyValueStoreType::MEMORY);
+					}
+				}
+			} catch (Error& e) {
+				dprint("Unexpected blob migrator request error {}\n", e.what());
+				throw;
+			}
+		}
+	}
+
+	// Handle StorageServerInterface APIs that are not supported. Simply log and return error
+	ACTOR static Future<Void> handleUnsupportedRequest(Reference<BlobMigrator> self) {
+		state StorageServerInterface ssi = self->interf_.ssi;
+		loop {
+			try {
+				choose {
+					when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
+						dprint("Unsupported SplitRangeRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
+						self->actors_.add(processStorageQueuingMetricsRequest(req));
+					}
+					when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
+						dprint("Unsupported ReadHotSubRange\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyValuesStreamRequest req = waitNext(ssi.getKeyValuesStream.getFuture())) {
+						dprint("Unsupported GetKeyValuesStreamRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyRequest req = waitNext(ssi.getKey.getFuture())) {
+						dprint("Unsupported GetKeyRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture())) {
+						/* dprint("Unsupported GetKeyValuesRequest {} - {} @ {}\n",
+						       req.begin.getKey().printable(),
+						       req.end.getKey().printable(),
+						       req.version); */
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetValueRequest req = waitNext(ssi.getValue.getFuture())) {
+						dprint("Unsupported GetValueRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetCheckpointRequest req = waitNext(ssi.checkpoint.getFuture())) {
+						dprint("Unsupported GetCheckpoint \n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(FetchCheckpointRequest req = waitNext(ssi.fetchCheckpoint.getFuture())) {
+						dprint("Unsupported FetchCheckpointRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
+						dprint("Unsupported UpdateCommitCostRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
+						dprint("Unsupported FetchCheckpointKeyValuesRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+				}
+			} catch (Error& e) {
+				dprint("Unexpected request handling error {}\n", e.what());
+				throw;
+			}
+		}
+	}
+
+	ACTOR static Future<Void> processWaitMetricsRequest(Reference<BlobMigrator> self, WaitMetricsRequest req) {
+		state WaitMetricsRequest waitMetricsRequest = req;
+		// FIXME get rid of this delay. it's a temp solution to avoid starvaion scheduling of DD
+		// processes
+		wait(delay(1));
+		StorageMetrics metrics;
+		metrics.bytes = sizeInBytes(self, waitMetricsRequest.keys);
+		waitMetricsRequest.reply.send(metrics);
+		return Void();
+	}
+
+	ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
+		dprint("Unsupported StorageQueuingMetricsRequest\n");
+		// FIXME get rid of this delay. it's a temp solution to avoid starvaion scheduling of DD
+		// processes
+		wait(delay(1));
+		req.reply.sendError(unsupported_operation());
+		return Void();
+	}
+
+	// Return total storage size in bytes for migration
+	static int64_t sizeInBytes(Reference<BlobMigrator> self) { return sizeInBytes(self, normalKeys); }
+
+	// Return storage size in bytes for given key range
+	static int64_t sizeInBytes(Reference<BlobMigrator> self, KeyRangeRef range) {
+		int64_t bytes = 0;
+		for (auto granule : self->blobGranules_) {
+			if (range.intersects(granule.keyRange))
+				bytes += granule.sizeInBytes;
+		}
+		return bytes;
+	}
+
+	// Return max version for all blob granules
+	static Version maxVersion(Reference<BlobMigrator> self) {
+		Version max = 0;
+		for (auto granule : self->blobGranules_) {
+			max = std::max(granule.version, max);
+		}
+		return max;
+	}
+
 private:
-	Database db;
-	Reference<BlobConnectionProvider> blobConn;
-	BlobMigratorInterface blobMigratorInterf;
-	ActorCollection actors;
+	Database db_;
+	Reference<BlobConnectionProvider> blobConn_;
+	BlobGranuleRestoreVersionVector blobGranules_;
+	BlobMigratorInterface interf_;
+	ActorCollection actors_;
 };

 // Main entry point
-ACTOR Future<Void> blobMigrator(BlobMigratorInterface ssi, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
-	fmt::print("Start blob migrator {} \n", ssi.id().toString());
+ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
+	fmt::print("Start blob migrator {} \n", interf.id().toString());
 	try {
-		Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, ssi);
+		Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
 		wait(BlobMigrator::start(self));
 	} catch (Error& e) {
-		fmt::print("unexpected blob migrator error {}\n", e.what());
+		dprint("Unexpected blob migrator error {}\n", e.what());
+		TraceEvent("BlobMigratorError", interf.id()).error(e);
 	}
 	return Void();
 }
--- a/fdbserver/BlobWorker.actor.cpp
+++ b/fdbserver/BlobWorker.actor.cpp
@ -84,6 +84,15 @@ struct GranuleStartState {
 	Optional<GranuleHistory> history;
 };

+// TODO: add more (blob file request cost, in-memory mutations vs blob delta file, etc...)
+struct GranuleReadStats {
+	int64_t deltaBytesRead;
+
+	void reset() { deltaBytesRead = 0; }
+
+	GranuleReadStats() { reset(); }
+};
+
 struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
 	KeyRange keyRange;

@ -120,11 +129,74 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {

 	AssignBlobRangeRequest originalReq;

+	GranuleReadStats readStats;
+	bool rdcCandidate;
+	Promise<Void> runRDC;
+
 	void resume() {
 		if (resumeSnapshot.canBeSet()) {
 			resumeSnapshot.send(Void());
 		}
 	}
+
+	void resetReadStats() {
+		rdcCandidate = false;
+		readStats.reset();
+		runRDC.reset();
+	}
+
+	// determine eligibility (>1) and priority for re-snapshotting this granule
+	double weightRDC() {
+		// ratio of read amp to write amp that would be incurred by re-snapshotting now
+		int64_t lastSnapshotSize = (files.snapshotFiles.empty()) ? 0 : files.snapshotFiles.back().length;
+		int64_t minSnapshotSize = SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2;
+		lastSnapshotSize = std::max(minSnapshotSize, lastSnapshotSize);
+
+		int64_t writeAmp = lastSnapshotSize + bufferedDeltaBytes + bytesInNewDeltaFiles;
+		// read amp is deltaBytesRead. Read amp must be READ_FACTOR times larger than write amp
+		return (1.0 * readStats.deltaBytesRead) / (writeAmp * SERVER_KNOBS->BG_RDC_READ_FACTOR);
+	}
+
+	bool isEligibleRDC() {
+		// granule should be reasonably read-hot to be eligible
+		int64_t bytesWritten = bufferedDeltaBytes + bytesInNewDeltaFiles;
+		return bytesWritten * SERVER_KNOBS->BG_RDC_READ_FACTOR < readStats.deltaBytesRead;
+	}
+
+	bool updateReadStats(Version readVersion, const BlobGranuleChunkRef& chunk) {
+		// Only update stats for re-compacting for at-latest reads that have to do snapshot + delta merge
+		if (!SERVER_KNOBS->BG_ENABLE_READ_DRIVEN_COMPACTION || !chunk.snapshotFile.present() ||
+		    pendingSnapshotVersion != durableSnapshotVersion.get() || readVersion <= pendingSnapshotVersion) {
+			return false;
+		}
+
+		if (chunk.newDeltas.empty() && chunk.deltaFiles.empty()) {
+			return false;
+		}
+
+		readStats.deltaBytesRead += chunk.newDeltas.expectedSize();
+		for (auto& it : chunk.deltaFiles) {
+			readStats.deltaBytesRead += it.length;
+		}
+
+		if (rdcCandidate) {
+			return false;
+		}
+
+		if (isEligibleRDC() && weightRDC() > 1.0) {
+			rdcCandidate = true;
+			CODE_PROBE(true, "Granule read triggering read-driven compaction");
+			if (BW_DEBUG) {
+				fmt::print("Triggering read-driven compaction of [{0} - {1})\n",
+				           keyRange.begin.printable(),
+				           keyRange.end.printable());
+			}
+			return true;
+		}
+		return false;
+	}
+
+	inline bool doReadDrivenCompaction() { return runRDC.isSet(); }
 };

 struct GranuleRangeMetadata {
@ -200,6 +272,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
 	NotifiedVersion grvVersion;
 	Promise<Void> fatalError;
 	Promise<Void> simInjectFailure;
+	Promise<Void> doReadDrivenCompaction;

 	Reference<FlowLock> initialSnapshotLock;
 	Reference<FlowLock> resnapshotLock;
@ -293,6 +366,13 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
 		return stats.estimatedMaxResidentMemory >= memoryFullThreshold;
 	}

+	void triggerReadDrivenCompaction() {
+		Promise<Void> doRDC = doReadDrivenCompaction;
+		if (doRDC.canBeSet()) {
+			doRDC.send(Void());
+		}
+	}
+
 	bool maybeInjectTargetedRestart() {
 		// inject a BW restart at most once per test
 		if (g_network->isSimulated() && !g_simulator->speedUpSimulation &&
@ -1107,7 +1187,6 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
 			}
 			retries++;
 			CODE_PROBE(true, "Granule initial snapshot failed");
-			// FIXME: why can't we supress error event?
 			TraceEvent(retries < 10 ? SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id)
 			    .error(err)
 			    .detail("Granule", metadata->keyRange)
@ -2043,6 +2122,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
 		metadata->pendingDeltaVersion = startVersion;
 		metadata->bufferedDeltaVersion = startVersion;
 		metadata->knownCommittedVersion = startVersion;
+		metadata->resetReadStats();

 		Reference<ChangeFeedData> cfData = makeReference<ChangeFeedData>(bwData->db.getPtr());

@ -2185,6 +2265,10 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
 						}
 						nextForceFlush = metadata->forceFlushVersion.whenAtLeast(lastForceFlushVersion + 1);
 					}
+					when(wait(metadata->runRDC.getFuture())) {
+						// return control flow back to the triggering actor before continuing
+						wait(delay(0));
+					}
 				}
 			} catch (Error& e) {
 				// only error we should expect here is when we finish consuming old change feed
@ -2311,6 +2395,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
 										    startState.granuleID,
 										    inFlightFiles.empty() ? Future<Void>(Void())
 										                          : success(inFlightFiles.back().future));
+										metadata->resetReadStats();
 									}
 									// reset force flush state, requests should retry and add it back once feed is ready
 									forceFlushVersions.clear();
@ -2419,20 +2504,20 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
 			// The force flush contract is a version cannot be put in forceFlushVersion unless the change feed
 			// is already whenAtLeast that version
 			bool forceFlush = !forceFlushVersions.empty() && forceFlushVersions.back() > metadata->pendingDeltaVersion;
+			bool doReadDrivenFlush = !metadata->currentDeltas.empty() && metadata->doReadDrivenCompaction();
 			CODE_PROBE(forceFlush, "Force flushing granule");
-			if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush) {
+			if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush ||
+			    doReadDrivenFlush) {
 				TraceEvent(SevDebug, "BlobGranuleDeltaFile", bwData->id)
 				    .detail("Granule", metadata->keyRange)
 				    .detail("Version", lastDeltaVersion);

 				// sanity check for version order
-
-				if (forceFlush) {
+				if (forceFlush || doReadDrivenFlush) {
 					if (lastDeltaVersion == invalidVersion) {
-						lastDeltaVersion = metadata->currentDeltas.empty() ? metadata->pendingDeltaVersion
-						                                                   : metadata->currentDeltas.back().version;
+						lastDeltaVersion = metadata->bufferedDeltaVersion;
 					}
-					if (lastDeltaVersion < forceFlushVersions.back()) {
+					if (!forceFlushVersions.empty() && lastDeltaVersion < forceFlushVersions.back()) {
 						if (BW_DEBUG) {
 							fmt::print("Granule [{0} - {1}) force flushing delta version {2} -> {3}\n",
 							           metadata->keyRange.begin.printable(),
@ -2444,13 +2529,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
 					}
 				}
 				if (!metadata->currentDeltas.empty()) {
-					if (lastDeltaVersion < metadata->currentDeltas.back().version) {
-						fmt::print("Granule [{0} - {1}) LDV {2} < DeltaBack {3}\n",
-						           metadata->keyRange.begin.printable(),
-						           metadata->keyRange.end.printable(),
-						           lastDeltaVersion,
-						           metadata->currentDeltas.back().version);
-					}
 					ASSERT(lastDeltaVersion >= metadata->currentDeltas.back().version);
 					ASSERT(metadata->pendingDeltaVersion < metadata->currentDeltas.front().version);
 				} else {
@ -2507,6 +2585,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
 				// add new pending delta file
 				ASSERT(metadata->pendingDeltaVersion < lastDeltaVersion);
 				metadata->pendingDeltaVersion = lastDeltaVersion;
+				ASSERT(metadata->bufferedDeltaVersion <= lastDeltaVersion);
 				metadata->bufferedDeltaVersion = lastDeltaVersion; // In case flush was forced at non-mutation version
 				metadata->bytesInNewDeltaFiles += metadata->bufferedDeltaBytes;

@ -2528,6 +2607,9 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
 				// Wait on delta file starting here. If we have too many pending delta file writes, we need to not
 				// continue to consume from the change feed, as that will pile on even more delta files to write
 				wait(startDeltaFileWrite);
+			} else if (metadata->doReadDrivenCompaction()) {
+				ASSERT(metadata->currentDeltas.empty());
+				snapshotEligible = true;
 			}

 			// FIXME: if we're still reading from old change feed, we should probably compact if we're
@ -2535,7 +2617,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
 			// yet

 			// If we have enough delta files, try to re-snapshot
-			if (snapshotEligible && metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT) {
+			if (snapshotEligible && (metadata->doReadDrivenCompaction() ||
+			                         metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT)) {
 				if (BW_DEBUG && !inFlightFiles.empty()) {
 					fmt::print("Granule [{0} - {1}) ready to re-snapshot at {2} after {3} > {4} bytes, "
 					           "waiting for "
@ -2583,6 +2666,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,

 				// reset metadata
 				metadata->bytesInNewDeltaFiles = 0;
+				metadata->resetReadStats();

 				// If we have more than one snapshot file and that file is unblocked (committedVersion >=
 				// snapshotVersion), wait for it to finish
@ -3740,6 +3824,11 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
 							}
 						}
 					}
+
+					// don't update read stats on a summarize read
+					if (metadata->updateReadStats(req.readVersion, chunk)) {
+						bwData->triggerReadDrivenCompaction();
+					}
 				}

 				rep.chunks.push_back(rep.arena, chunk);
@ -3961,7 +4050,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
 				}
 			}

-			if (createChangeFeed) {
+			if (createChangeFeed && !isFullRestoreMode()) {
 				// create new change feed for new version of granule
 				wait(updateChangeFeed(
 				    &tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));
@ -4554,6 +4643,74 @@ ACTOR Future<Void> runGRVChecks(Reference<BlobWorkerData> bwData) {
 	}
 }

+struct RDCEntry {
+	double weight;
+	Reference<GranuleMetadata> granule;
+	RDCEntry(double weight, Reference<GranuleMetadata> granule) : weight(weight), granule(granule) {}
+};
+
+// for a top-k algorithm, we actually want a min-heap, so reverse the sort order
+struct OrderForTopK {
+	bool operator()(RDCEntry const& a, RDCEntry const& b) const { return b.weight - a.weight; }
+};
+
+typedef std::priority_queue<RDCEntry, std::vector<RDCEntry>, OrderForTopK> TopKPQ;
+
+ACTOR Future<Void> runReadDrivenCompaction(Reference<BlobWorkerData> bwData) {
+	state bool processedAll = true;
+	loop {
+		if (processedAll) {
+			wait(bwData->doReadDrivenCompaction.getFuture());
+			bwData->doReadDrivenCompaction.reset();
+			wait(delay(0));
+		}
+
+		TopKPQ topK;
+
+		// FIXME: possible to scan candidates instead of all granules?
+		int candidates = 0;
+		auto allRanges = bwData->granuleMetadata.intersectingRanges(normalKeys);
+		for (auto& it : allRanges) {
+			if (it.value().activeMetadata.isValid() && it.value().activeMetadata->cancelled.canBeSet()) {
+				auto metadata = it.value().activeMetadata;
+				if (metadata->rdcCandidate && metadata->isEligibleRDC() && metadata->runRDC.canBeSet() &&
+				    metadata->pendingSnapshotVersion == metadata->durableSnapshotVersion.get()) {
+					candidates++;
+					double weight = metadata->weightRDC();
+					if (weight > 1.0 &&
+					    (topK.size() < SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM || weight > topK.top().weight)) {
+						if (topK.size() == SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM) {
+							topK.pop();
+						}
+						topK.push(RDCEntry(weight, metadata));
+					}
+				}
+			}
+		}
+
+		CODE_PROBE(candidates > topK.size(), "Too many read-driven compaction candidates for one cycle");
+
+		std::vector<Future<Void>> futures;
+		futures.reserve(topK.size());
+		while (!topK.empty()) {
+			++bwData->stats.readDrivenCompactions;
+			Promise<Void> runRDC = topK.top().granule->runRDC;
+			ASSERT(runRDC.canBeSet());
+			Future<Void> waitForSnapshotComplete = topK.top().granule->durableSnapshotVersion.whenAtLeast(
+			                                           topK.top().granule->durableSnapshotVersion.get() + 1) ||
+			                                       topK.top().granule->cancelled.getFuture();
+			futures.push_back(waitForSnapshotComplete);
+			topK.pop();
+			runRDC.send(Void());
+		}
+		processedAll = futures.empty();
+		if (!futures.empty()) {
+			// wait at least one second to throttle this actor a bit
+			wait(waitForAll(futures) && delay(1.0));
+		}
+	}
+}
+
 // FIXME: better way to do this?
 // monitor system keyspace for new tenants
 ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
@ -4891,6 +5048,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
 	self->addActor.send(waitFailureServer(bwInterf.waitFailure.getFuture()));
 	self->addActor.send(runGRVChecks(self));
 	self->addActor.send(monitorTenants(self));
+	self->addActor.send(runReadDrivenCompaction(self));
 	state Future<Void> selfRemoved = monitorRemoval(self);
 	if (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.25)) {
 		self->addActor.send(simForceFileWriteContention(self));
@ -5024,13 +5182,22 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
 				ASSERT(false);
 				throw internal_error();
 			}
-			when(wait(selfRemoved || self->simInjectFailure.getFuture())) {
+			when(wait(selfRemoved)) {
 				if (BW_DEBUG) {
 					printf("Blob worker detected removal. Exiting...\n");
 				}
 				TraceEvent("BlobWorkerRemoved", self->id);
 				break;
 			}
+			when(wait(self->simInjectFailure.getFuture())) {
+				// wait to let triggering actor finish to prevent weird shutdown races
+				wait(delay(0));
+				if (BW_DEBUG) {
+					printf("Blob worker simulation injected failure. Exiting...\n");
+				}
+				TraceEvent("BlobWorkerSimRemoved", self->id);
+				break;
+			}
 			when(wait(self->fatalError.getFuture())) {
 				TraceEvent(SevError, "BlobWorkerActorCollectionFatalErrorNotError", self->id);
 				ASSERT(false);
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@ -2615,8 +2615,9 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
 	}
 	loop {
 		if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
-			state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
-			                                                SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
+			state Future<Void> wfClient =
+			    waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure,
+			                      SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
 			loop {
 				choose {
 					when(wait(wfClient)) {
@ -3006,11 +3007,10 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
 	self.addActor.send(monitorConsistencyScan(&self));
 	self.addActor.send(metaclusterMetricsUpdater(&self));
 	self.addActor.send(dbInfoUpdater(&self));
-	self.addActor.send(traceCounters("ClusterControllerMetrics",
-	                                 self.id,
-	                                 SERVER_KNOBS->STORAGE_LOGGING_DELAY,
-	                                 &self.clusterControllerMetrics,
-	                                 self.id.toString() + "/ClusterControllerMetrics"));
+	self.addActor.send(self.clusterControllerMetrics.traceCounters("ClusterControllerMetrics",
+	                                                               self.id,
+	                                                               SERVER_KNOBS->STORAGE_LOGGING_DELAY,
+	                                                               self.id.toString() + "/ClusterControllerMetrics"));
 	self.addActor.send(traceRole(Role::CLUSTER_CONTROLLER, interf.id()));
 	// printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());

--- a/fdbserver/ConfigBroadcaster.actor.cpp
+++ b/fdbserver/ConfigBroadcaster.actor.cpp
@ -183,8 +183,8 @@ class ConfigBroadcasterImpl {
 	    id(deterministicRandom()->randomUniqueID()), cc("ConfigBroadcaster"), compactRequest("CompactRequest", cc),
 	    successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc),
 	    snapshotRequest("SnapshotRequest", cc) {
-		logger = traceCounters(
-		    "ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigBroadcasterMetrics");
+		logger = cc.traceCounters(
+		    "ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigBroadcasterMetrics");
 	}

 	void addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
--- a/fdbserver/ConfigNode.actor.cpp
+++ b/fdbserver/ConfigNode.actor.cpp
@ -812,7 +812,7 @@ public:
 	    successfulCommits("SuccessfulCommits", cc), failedCommits("FailedCommits", cc),
 	    setMutations("SetMutations", cc), clearMutations("ClearMutations", cc),
 	    getValueRequests("GetValueRequests", cc), getGenerationRequests("GetGenerationRequests", cc) {
-		logger = traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigNode");
+		logger = cc.traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigNode");
 		TraceEvent(SevInfo, "StartingConfigNode", id).detail("KVStoreAlreadyExists", kvStore.exists());
 	}

--- a/fdbserver/ConsistencyScan.actor.cpp
+++ b/fdbserver/ConsistencyScan.actor.cpp
@ -29,7 +29,7 @@
 #include "fdbclient/ReadYourWrites.h"
 #include "fdbclient/TagThrottle.actor.h"
 #include "fdbserver/Knobs.h"
-#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/StorageMetrics.actor.h"
 #include "fdbserver/DataDistribution.actor.h"
 #include "fdbserver/RatekeeperInterface.h"
 #include "fdbserver/ServerDBInfo.h"
@ -393,6 +393,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
 	state double rateLimiterStartTime = now();
 	state int64_t bytesReadInthisRound = 0;
 	state bool resume = !(restart || shuffleShards);
+	state bool testResult = true;

 	state double dbSize = 100e12;
 	if (g_network->isSimulated()) {
@ -710,7 +711,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
 									    (!storageServerInterfaces[j].isTss() &&
 									     !storageServerInterfaces[firstValidServer].isTss())) {
 										testFailure("Data inconsistent", performQuiescentChecks, true);
-										return false;
+										testResult = false;
 									}
 								}
 							}
@ -949,7 +950,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
 	}

 	*bytesReadInPrevRound = bytesReadInthisRound;
-	return true;
+	return testResult;
 }

 ACTOR Future<Void> runDataValidationCheck(ConsistencyScanData* self) {
--- a/fdbserver/DDRelocationQueue.actor.cpp
+++ b/fdbserver/DDRelocationQueue.actor.cpp
@ -689,6 +689,17 @@ struct DDQueue : public IDDRelocationQueue {

 	int moveReusePhysicalShard;
 	int moveCreateNewPhysicalShard;
+	enum RetryFindDstReason {
+		None = 0,
+		RemoteBestTeamNotReady,
+		PrimaryNoHealthyTeam,
+		RemoteNoHealthyTeam,
+		RemoteTeamIsFull,
+		RemoteTeamIsNotHealthy,
+		NoAvailablePhysicalShard,
+		NumberOfTypes,
+	};
+	std::vector<int> retryFindDstReasonCount;

 	void startRelocation(int priority, int healthPriority) {
 		// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
@ -754,7 +765,8 @@ struct DDQueue : public IDDRelocationQueue {
 	    suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
 	    rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
 	    movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
-	    moveCreateNewPhysicalShard(0) {}
+	    moveCreateNewPhysicalShard(0), retryFindDstReasonCount(static_cast<int>(RetryFindDstReason::NumberOfTypes), 0) {
+	}
 	DDQueue() = default;

 	void validate() {
@ -1467,6 +1479,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
 		loop {
 			destOverloadedCount = 0;
 			stuckCount = 0;
+			state DDQueue::RetryFindDstReason retryFindDstReason = DDQueue::RetryFindDstReason::None;
 			// state int bestTeamStuckThreshold = 50;
 			loop {
 				state int tciIndex = 0;
@ -1493,10 +1506,13 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
 							    .detail("TeamCollectionIndex", tciIndex)
 							    .detail("RestoreDataMoveForDest",
 							            describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest));
+							retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
 							foundTeams = false;
 							break;
 						}
 						if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) {
+							retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
+							                                   : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
 							foundTeams = false;
 							break;
 						}
@ -1549,12 +1565,15 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
 							// getting the destination team or we could miss failure notifications for the storage
 							// servers in the destination team
 							TraceEvent("BestTeamNotReady");
+							retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
 							foundTeams = false;
 							break;
 						}
 						// If a DC has no healthy team, we stop checking the other DCs until
 						// the unhealthy DC is healthy again or is excluded.
 						if (!bestTeam.first.present()) {
+							retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
+							                                   : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
 							foundTeams = false;
 							break;
 						}
@ -1578,6 +1597,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
 							if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
 								bool minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
 								if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
+									retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
 									foundTeams = false;
 									break;
 								}
@ -1620,6 +1640,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
 				if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
 				    bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
 					if (!bestTeams[1].first->isHealthy()) {
+						retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
 						foundTeams = false;
 					}
 				}
@ -1684,6 +1705,14 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
 						self->moveReusePhysicalShard++;
 					} else {
 						self->moveCreateNewPhysicalShard++;
+						if (retryFindDstReason == DDQueue::RetryFindDstReason::None) {
+							// When creating a new physical shard, but the reason is none, this can only happen when
+							// determinePhysicalShardIDGivenPrimaryTeam() finds that there is no available physical
+							// shard.
+							self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++;
+						} else {
+							self->retryFindDstReasonCount[retryFindDstReason]++;
+						}
 					}
 					rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
 					auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
@ -2485,9 +2514,25 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
 					if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 						TraceEvent("PhysicalShardMoveStats")
 						    .detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
-						    .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard);
+						    .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard)
+						    .detail("RemoteBestTeamNotReady",
+						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteBestTeamNotReady])
+						    .detail("PrimaryNoHealthyTeam",
+						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam])
+						    .detail("RemoteNoHealthyTeam",
+						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteNoHealthyTeam])
+						    .detail("RemoteTeamIsFull",
+						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull])
+						    .detail("RemoteTeamIsNotHealthy",
+						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy])
+						    .detail(
+						        "NoAvailablePhysicalShard",
+						        self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]);
 						self.moveCreateNewPhysicalShard = 0;
 						self.moveReusePhysicalShard = 0;
+						for (int i = 0; i < self.retryFindDstReasonCount.size(); ++i) {
+							self.retryFindDstReasonCount[i] = 0;
+						}
 					}
 				}
 				when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
--- a/fdbserver/DDShardTracker.actor.cpp
+++ b/fdbserver/DDShardTracker.actor.cpp
@ -212,7 +212,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys,
                                         const Reference<AsyncVar<Optional<ShardMetrics>>>& shardMetrics,
                                         const BandwidthStatus& bandwidthStatus,
                                         PromiseStream<KeyRange> readHotShard) {
-	ShardSizeBounds bounds;
+	ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
 	if (shardMetrics->get().present()) {
 		auto bytes = shardMetrics->get().get().metrics.bytes;
 		auto readBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get().metrics);
@ -259,21 +259,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys,
 		} else {
 			ASSERT(false);
 		}
-	} else {
-		bounds.max.bytes = -1;
-		bounds.min.bytes = -1;
-		bounds.permittedError.bytes = -1;
-		bounds.max.bytesPerKSecond = bounds.max.infinity;
-		bounds.min.bytesPerKSecond = 0;
-		bounds.permittedError.bytesPerKSecond = bounds.permittedError.infinity;
-		bounds.max.bytesReadPerKSecond = bounds.max.infinity;
-		bounds.min.bytesReadPerKSecond = 0;
-		bounds.permittedError.bytesReadPerKSecond = bounds.permittedError.infinity;
 	}
-
-	bounds.max.iosPerKSecond = bounds.max.infinity;
-	bounds.min.iosPerKSecond = 0;
-	bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity;
 	return bounds;
 }

--- a/fdbserver/DDTeamCollection.actor.cpp
+++ b/fdbserver/DDTeamCollection.actor.cpp
@ -895,7 +895,7 @@ public:
 							if (maxPriority < SERVER_KNOBS->PRIORITY_TEAM_FAILED) {
 								std::pair<std::vector<ShardsAffectedByTeamFailure::Team>,
 								          std::vector<ShardsAffectedByTeamFailure::Team>>
-								    teams = self->shardsAffectedByTeamFailure->getTeamsFor(shards[i]);
+								    teams = self->shardsAffectedByTeamFailure->getTeamsForFirstShard(shards[i]);
 								for (int j = 0; j < teams.first.size() + teams.second.size(); j++) {
 									// t is the team in primary DC or the remote DC
 									auto& t =
--- a/fdbserver/DDTxnProcessor.actor.cpp
+++ b/fdbserver/DDTxnProcessor.actor.cpp
@ -763,7 +763,7 @@ std::vector<DDShardInfo> DDMockTxnProcessor::getDDShardInfos() const {
 		KeyRangeRef curRange = it->range();
 		DDShardInfo info(curRange.begin);

-		auto teams = mgs->shardMapping->getTeamsFor(curRange);
+		auto teams = mgs->shardMapping->getTeamsForFirstShard(curRange);
 		if (!teams.first.empty() && !teams.second.empty()) {
 			CODE_PROBE(true, "Mock InitialDataDistribution In-Flight shard");
 			info.hasDest = true;
@ -816,7 +816,7 @@ Future<Void> DDMockTxnProcessor::removeStorageServer(const UID& serverID,
                                                     const Optional<UID>& tssPairID,
                                                     const MoveKeysLock& lock,
                                                     const DDEnabledState* ddEnabledState) const {
-	ASSERT(mgs->allShardRemovedFromServer(serverID));
+	ASSERT(mgs->allShardsRemovedFromServer(serverID));
 	mgs->allServers.erase(serverID);
 	return Void();
 }
@ -862,16 +862,14 @@ Future<HealthMetrics> DDMockTxnProcessor::getHealthMetrics(bool detailed) const
 	return Future<HealthMetrics>();
 }

-// FIXME: finish implementation
 Future<Standalone<VectorRef<KeyRef>>> DDMockTxnProcessor::splitStorageMetrics(
    const KeyRange& keys,
    const StorageMetrics& limit,
    const StorageMetrics& estimated,
    const Optional<int>& minSplitBytes) const {
-	return Future<Standalone<VectorRef<KeyRef>>>();
+	return mgs->splitStorageMetrics(keys, limit, estimated, minSplitBytes);
 }

-// FIXME: finish implementation
 Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorageMetrics(
    const KeyRange& keys,
    const StorageMetrics& min,
@ -879,7 +877,7 @@ Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorage
    const StorageMetrics& permittedError,
    int shardLimit,
    int expectedShardCount) const {
-	return Future<std::pair<Optional<StorageMetrics>, int>>();
+	return mgs->waitStorageMetrics(keys, min, max, permittedError, shardLimit, expectedShardCount);
 }

 // FIXME: finish implementation
@ -910,7 +908,7 @@ void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params,
 	ASSERT(params.finishMoveKeysParallelismLock->take().isReady());

 	// get source and dest teams
-	auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsFor(params.keys);
+	auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys);

 	ASSERT_EQ(destTeams.size(), 0);
 	if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) {
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@ -53,6 +53,20 @@
 #include "fdbserver/DDSharedContext.h"
 #include "flow/actorcompiler.h" // This must be the last #include.

+ShardSizeBounds ShardSizeBounds::shardSizeBoundsBeforeTrack() {
+	return ShardSizeBounds{
+		.max = StorageMetrics{ .bytes = -1,
+		                       .bytesPerKSecond = StorageMetrics::infinity,
+		                       .iosPerKSecond = StorageMetrics::infinity,
+		                       .bytesReadPerKSecond = StorageMetrics::infinity },
+		.min = StorageMetrics{ .bytes = -1, .bytesPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 },
+		.permittedError = StorageMetrics{ .bytes = -1,
+		                                  .bytesPerKSecond = StorageMetrics::infinity,
+		                                  .iosPerKSecond = StorageMetrics::infinity,
+		                                  .bytesReadPerKSecond = StorageMetrics::infinity }
+	};
+}
+
 struct DDAudit {
 	DDAudit(UID id, KeyRange range, AuditType type)
 	  : id(id), range(range), type(type), auditMap(AuditPhase::Invalid, allKeys.end), actors(true) {}
@ -286,8 +300,6 @@ public:
 	PromiseStream<RelocateShard> relocationProducer, relocationConsumer;
 	Reference<PhysicalShardCollection> physicalShardCollection;

-	StorageQuotaInfo storageQuotaInfo;
-
 	Promise<Void> initialized;

 	std::unordered_map<AuditType, std::vector<std::shared_ptr<DDAudit>>> audits;
@ -542,27 +554,6 @@ public:
 	}
 };

-ACTOR Future<Void> storageQuotaTracker(Database cx, StorageQuotaInfo* storageQuotaInfo) {
-	loop {
-		state Transaction tr(cx);
-		loop {
-			try {
-				state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
-				TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size());
-				for (auto const kv : currentQuotas) {
-					Key const key = kv.key.removePrefix(storageQuotaPrefix);
-					uint64_t const quota = BinaryReader::fromStringRef<uint64_t>(kv.value, Unversioned());
-					storageQuotaInfo->quotaMap[key] = quota;
-				}
-				wait(delay(5.0));
-				break;
-			} catch (Error& e) {
-				wait(tr.onError(e));
-			}
-		}
-	}
-}
-
 // Periodically check and log the physicalShard status; clean up empty physicalShard;
 ACTOR Future<Void> monitorPhysicalShardStatus(Reference<PhysicalShardCollection> self) {
 	ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
@ -683,16 +674,15 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
 			                                    self->ddId,
 			                                    &normalDDQueueErrors()));

-			actors.push_back(reportErrorsExcept(storageQuotaTracker(cx, &self->storageQuotaInfo),
-			                                    "StorageQuotaTracker",
-			                                    self->ddId,
-			                                    &normalDDQueueErrors()));
-
 			if (ddIsTenantAware) {
 				actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorTenantMap(),
 				                                    "DDTenantCacheMonitor",
 				                                    self->ddId,
 				                                    &normalDDQueueErrors()));
+				actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageQuota(),
+				                                    "StorageQuotaTracker",
+				                                    self->ddId,
+				                                    &normalDDQueueErrors()));
 				actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(),
 				                                    "StorageUsageTracker",
 				                                    self->ddId,
--- a/fdbserver/GlobalTagThrottler.actor.cpp
+++ b/fdbserver/GlobalTagThrottler.actor.cpp
@ -202,7 +202,8 @@ class GlobalTagThrottlerImpl {
 		for (const auto& [id, _] : throughput) {
 			result += getCurrentCost(id, tag).orDefault(0);
 		}
-		TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result);
+		// FIXME: Disabled due to noisy trace events. Fix the noise and reenabled
+		//TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result);

 		return result;
 	}
@ -235,10 +236,13 @@ class GlobalTagThrottlerImpl {
 			return 1.0;
 		}
 		auto const transactionRate = stats.get().getTransactionRate();
+		// FIXME: Disabled due to noisy trace events. Fix the noise and reenabled
+		/*
 		TraceEvent("GlobalTagThrottler_GetAverageTransactionCost")
 		    .detail("Tag", tag)
 		    .detail("TransactionRate", transactionRate)
 		    .detail("Cost", cost);
+		*/
 		if (transactionRate == 0.0) {
 			return 1.0;
 		} else {
--- a/fdbserver/GrvProxyServer.actor.cpp
+++ b/fdbserver/GrvProxyServer.actor.cpp
@ -154,7 +154,7 @@ struct GrvProxyStats {
 			return int64_t(100 * this->percentageOfBatchGRVQueueProcessed);
 		});

-		logger = traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "GrvProxyMetrics");
+		logger = cc.traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "GrvProxyMetrics");
 		for (int i = 0; i < FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS; i++) {
 			requestBuckets.push_back(0);
 		}
@ -459,9 +459,9 @@ void dropRequestFromQueue(Deque<GetReadVersionRequest>* queue, GrvProxyStats* st

 // Put a GetReadVersion request into the queue corresponding to its priority.
 ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo> const> db,
-                                               SpannedDeque<GetReadVersionRequest>* systemQueue,
-                                               SpannedDeque<GetReadVersionRequest>* defaultQueue,
-                                               SpannedDeque<GetReadVersionRequest>* batchQueue,
+                                               Deque<GetReadVersionRequest>* systemQueue,
+                                               Deque<GetReadVersionRequest>* defaultQueue,
+                                               Deque<GetReadVersionRequest>* batchQueue,
                                               FutureStream<GetReadVersionRequest> readVersionRequests,
                                               PromiseStream<Void> GRVTimer,
                                               double* lastGRVTime,
@ -531,7 +531,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
 					stats->txnSystemPriorityStartIn += req.transactionCount;
 					++stats->systemGRVQueueSize;
 					systemQueue->push_back(req);
-					// systemQueue->span.addParent(req.spanContext);
 				} else if (req.priority >= TransactionPriority::DEFAULT) {
 					++stats->txnRequestIn;
 					stats->txnStartIn += req.transactionCount;
@ -542,7 +541,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
 					} else {
 						defaultQueue->push_back(req);
 					}
-					// defaultQueue->span.addParent(req.spanContext);
 				} else {
 					// Return error for batch_priority GRV requests
 					int64_t proxiesCount = std::max((int)db->get().client.grvProxies.size(), 1);
@ -559,7 +557,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
 						} else {
 							batchQueue->push_back(req);
 						}
-						// batchQueue->span.addParent(req.spanContext);
 					}
 				}
 			}
@ -607,7 +604,7 @@ ACTOR Future<Void> lastCommitUpdater(GrvProxyData* self, PromiseStream<Future<Vo
 	}
 }

-ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanContext parentSpan,
+ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(std::vector<SpanContext> spanContexts,
                                                          GrvProxyData* grvProxyData,
                                                          uint32_t flags,
                                                          Optional<UID> debugID,
@ -620,7 +617,10 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanContext parentSpan
 	// before the request returns, so it is committed. (2) No proxy on our list reported committed a higher version
 	// before this request was received, because then its committedVersion would have been higher,
 	//     and no other proxy could have already committed anything without first ending the epoch
-	state Span span("GP:getLiveCommittedVersion"_loc, parentSpan);
+	state Span span("GP:getLiveCommittedVersion"_loc);
+	for (const SpanContext& spanContext : spanContexts) {
+		span.addLink(spanContext);
+	}
 	++grvProxyData->stats.txnStartBatch;

 	state double grvStart = now();
@ -826,15 +826,14 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 	state GrvTransactionRateInfo batchRateInfo(0);
 	state GrvProxyTransactionTagThrottler tagThrottler;

-	state SpannedDeque<GetReadVersionRequest> systemQueue("GP:transactionStarterSystemQueue"_loc);
-	state SpannedDeque<GetReadVersionRequest> defaultQueue("GP:transactionStarterDefaultQueue"_loc);
-	state SpannedDeque<GetReadVersionRequest> batchQueue("GP:transactionStarterBatchQueue"_loc);
+	state Deque<GetReadVersionRequest> systemQueue;
+	state Deque<GetReadVersionRequest> defaultQueue;
+	state Deque<GetReadVersionRequest> batchQueue;

 	state TransactionTagMap<uint64_t> transactionTagCounter;
 	state PrioritizedTransactionTagMap<ClientTagThrottleLimits> clientThrottledTags;

 	state PromiseStream<double> normalGRVLatency;
-	// state Span span;

 	state int64_t midShardSize = SERVER_KNOBS->MIN_SHARD_BYTES;
 	getCurrentLineage()->modify(&TransactionLineage::operation) =
@ -911,7 +910,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 		uint32_t defaultQueueSize = defaultQueue.size();
 		uint32_t batchQueueSize = batchQueue.size();
 		while (requestsToStart < SERVER_KNOBS->START_TRANSACTION_MAX_REQUESTS_TO_START) {
-			SpannedDeque<GetReadVersionRequest>* transactionQueue;
+			Deque<GetReadVersionRequest>* transactionQueue;
 			if (!systemQueue.empty()) {
 				transactionQueue = &systemQueue;
 			} else if (!defaultQueue.empty()) {
@ -921,7 +920,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 			} else {
 				break;
 			}
-			// transactionQueue->span.swap(span);

 			auto& req = transactionQueue->front();
 			int tc = req.transactionCount;
@ -1017,7 +1015,13 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 		int batchGRVProcessed = 0;
 		for (int i = 0; i < start.size(); i++) {
 			if (start[i].size()) {
-				Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(SpanContext(),
+				std::vector<SpanContext> spanContexts;
+				spanContexts.reserve(start[i].size());
+				for (const GetReadVersionRequest& request : start[i]) {
+					spanContexts.push_back(request.spanContext);
+				}
+
+				Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(spanContexts,
 				                                                                       grvProxyData,
 				                                                                       i,
 				                                                                       debugID,
@ -1041,7 +1045,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 				batchGRVProcessed += batchPriTransactionsStarted[i];
 			}
 		}
-		// span = Span(span.location);

 		grvProxyData->stats.percentageOfDefaultGRVQueueProcessed =
 		    defaultQueueSize ? (double)defaultGRVProcessed / defaultQueueSize : 1;
--- a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp
+++ b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp
@ -18,6 +18,7 @@
 * limitations under the License.
 */

+#include "fdbclient/Knobs.h"
 #include "fdbserver/GrvProxyTransactionTagThrottler.h"
 #include "flow/UnitTest.h"
 #include "flow/actorcompiler.h" // must be last include
@ -28,6 +29,10 @@ void GrvProxyTransactionTagThrottler::DelayedRequest::updateProxyTagThrottledDur
 	req.proxyTagThrottledDuration = now() - startTime;
 }

+bool GrvProxyTransactionTagThrottler::DelayedRequest::isMaxThrottled() const {
+	return now() - startTime > CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION;
+}
+
 void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
 	if (rateInfo.present()) {
 		rateInfo.get().setRate(rate);
@ -36,6 +41,20 @@ void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
 	}
 }

+bool GrvProxyTransactionTagThrottler::TagQueue::isMaxThrottled() const {
+	return !requests.empty() && requests.front().isMaxThrottled();
+}
+
+void GrvProxyTransactionTagThrottler::TagQueue::rejectRequests() {
+	CODE_PROBE(true, "GrvProxyTransactionTagThrottler rejecting requests");
+	while (!requests.empty()) {
+		auto& delayedReq = requests.front();
+		delayedReq.updateProxyTagThrottledDuration();
+		delayedReq.req.reply.sendError(proxy_tag_throttled());
+		requests.pop_front();
+	}
+}
+
 void GrvProxyTransactionTagThrottler::updateRates(TransactionTagMap<double> const& newRates) {
 	for (const auto& [tag, rate] : newRates) {
 		auto it = queues.find(tag);
@ -73,6 +92,7 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
 		// SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be
 		// unexpected behaviour, because only one tag is used for throttling.
 		TraceEvent(SevWarnAlways, "GrvProxyTransactionTagThrottler_MultipleTags")
+		    .suppressFor(1.0)
 		    .detail("NumTags", req.tags.size())
 		    .detail("UsingTag", printable(tag));
 	}
@ -80,8 +100,8 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
 }

 void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
-                                                          SpannedDeque<GetReadVersionRequest>& outBatchPriority,
-                                                          SpannedDeque<GetReadVersionRequest>& outDefaultPriority) {
+                                                          Deque<GetReadVersionRequest>& outBatchPriority,
+                                                          Deque<GetReadVersionRequest>& outDefaultPriority) {
 	// Pointer to a TagQueue with some extra metadata stored alongside
 	struct TagQueueHandle {
 		// Store pointers here to avoid frequent std::unordered_map lookups
@ -140,6 +160,11 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
 				// Cannot release any more transaction from this tag (don't push the tag queue handle back into
 				// pqOfQueues)
 				CODE_PROBE(true, "GrvProxyTransactionTagThrottler throttling transaction");
+				if (tagQueueHandle.queue->isMaxThrottled()) {
+					// Requests in this queue have been throttled too long and errors
+					// should be sent to clients.
+					tagQueueHandle.queue->rejectRequests();
+				}
 				break;
 			} else {
 				if (tagQueueHandle.nextSeqNo < nextQueueSeqNo) {
@ -255,8 +280,8 @@ ACTOR static Future<Void> mockFifoClient(GrvProxyTransactionTagThrottler* thrott
 }

 ACTOR static Future<Void> mockServer(GrvProxyTransactionTagThrottler* throttler) {
-	state SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
-	state SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
+	state Deque<GetReadVersionRequest> outBatchPriority;
+	state Deque<GetReadVersionRequest> outDefaultPriority;
 	loop {
 		state double elapsed = (0.009 + 0.002 * deterministicRandom()->random01());
 		wait(delay(elapsed));
@ -379,8 +404,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup2") {
 	throttler.updateRates(TransactionTagMap<double>{});
 	ASSERT_EQ(throttler.size(), 1);
 	{
-		SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
-		SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
+		Deque<GetReadVersionRequest> outBatchPriority;
+		Deque<GetReadVersionRequest> outDefaultPriority;
 		throttler.releaseTransactions(0.1, outBatchPriority, outDefaultPriority);
 	}
 	// Calling updates cleans up the queues in throttler
--- a/fdbserver/LocalConfiguration.actor.cpp
+++ b/fdbserver/LocalConfiguration.actor.cpp
@ -347,8 +347,8 @@ public:
 			                            Randomize::False,
 			                            g_network->isSimulated() ? IsSimulated::True : IsSimulated::False);
 		}
-		logger = traceCounters(
-		    "LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "LocalConfigurationMetrics");
+		logger = cc.traceCounters(
+		    "LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "LocalConfigurationMetrics");
 	}

 	Future<Void> addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> changes,
--- a/fdbserver/LogRouter.actor.cpp
+++ b/fdbserver/LogRouter.actor.cpp
@ -190,15 +190,14 @@ struct LogRouterData {
 		});
 		specialCounter(cc, "Generation", [this]() { return this->generation; });
 		specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; });
-		logger = traceCounters("LogRouterMetrics",
-		                       dbgid,
-		                       SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
-		                       &cc,
-		                       "LogRouterMetrics",
-		                       [this](TraceEvent& te) {
-			                       te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
-			                       te.detail("RouterTag", this->routerTag.toString());
-		                       });
+		logger = cc.traceCounters("LogRouterMetrics",
+		                          dbgid,
+		                          SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
+		                          "LogRouterMetrics",
+		                          [this](TraceEvent& te) {
+			                          te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
+			                          te.detail("RouterTag", this->routerTag.toString());
+		                          });
 	}
 };

--- a/fdbserver/MockGlobalState.actor.cpp
+++ b/fdbserver/MockGlobalState.actor.cpp
@ -0,0 +1,623 @@
+/*
+ * MockGlobalState.actor.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fdbserver/MockGlobalState.h"
+#include "fdbserver/workloads/workloads.actor.h"
+#include "fdbserver/DataDistribution.actor.h"
+#include "flow/actorcompiler.h"
+
+class MockGlobalStateImpl {
+public:
+	ACTOR static Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(MockGlobalState* mgs,
+	                                                                                 KeyRange keys,
+	                                                                                 StorageMetrics min,
+	                                                                                 StorageMetrics max,
+	                                                                                 StorageMetrics permittedError,
+	                                                                                 int shardLimit,
+	                                                                                 int expectedShardCount) {
+		state TenantInfo tenantInfo;
+		loop {
+			auto locations = mgs->getKeyRangeLocations(tenantInfo,
+			                                           keys,
+			                                           shardLimit,
+			                                           Reverse::False,
+			                                           SpanContext(),
+			                                           Optional<UID>(),
+			                                           UseProvisionalProxies::False,
+			                                           0)
+			                     .get();
+			TraceEvent(SevDebug, "MGSWaitStorageMetrics").detail("Phase", "GetLocation");
+			// NOTE(xwang): in native API, there's code handling the non-equal situation, but I think in mock world
+			// there shouldn't have any delay to update the locations.
+			ASSERT_EQ(expectedShardCount, locations.size());
+
+			Optional<StorageMetrics> res =
+			    wait(::waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
+
+			if (res.present()) {
+				return std::make_pair(res, -1);
+			}
+			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
+		}
+	}
+
+	// SOMEDAY: reuse the NativeAPI implementation
+	ACTOR static Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(MockGlobalState* mgs,
+	                                                                       KeyRange keys,
+	                                                                       StorageMetrics limit,
+	                                                                       StorageMetrics estimated,
+	                                                                       Optional<int> minSplitBytes) {
+		state TenantInfo tenantInfo;
+		loop {
+			state std::vector<KeyRangeLocationInfo> locations =
+			    mgs->getKeyRangeLocations(tenantInfo,
+			                              keys,
+			                              CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT,
+			                              Reverse::False,
+			                              SpanContext(),
+			                              Optional<UID>(),
+			                              UseProvisionalProxies::False,
+			                              0)
+			        .get();
+
+			// Same solution to NativeAPI::splitStorageMetrics, wait some merge finished
+			if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
+				wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
+			}
+
+			Optional<Standalone<VectorRef<KeyRef>>> results =
+			    wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));
+
+			if (results.present()) {
+				return results.get();
+			}
+
+			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
+		}
+	}
+};
+
+class MockStorageServerImpl {
+public:
+	ACTOR static Future<Void> waitMetricsTenantAware(MockStorageServer* self, WaitMetricsRequest req) {
+		if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) {
+			// TODO(xwang) add support for tenant test, search for tenant entry
+			Optional<TenantMapEntry> entry;
+			Optional<Key> tenantPrefix = entry.map<Key>([](TenantMapEntry e) { return e.prefix; });
+			if (tenantPrefix.present()) {
+				UNREACHABLE();
+				// req.keys = req.keys.withPrefix(tenantPrefix.get(), req.arena);
+			}
+		}
+
+		if (!self->isReadable(req.keys)) {
+			self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
+		} else {
+			wait(self->metrics.waitMetrics(req, delayJittered(SERVER_KNOBS->STORAGE_METRIC_TIMEOUT)));
+		}
+		return Void();
+	}
+};
+
+bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
+	auto ranges = serverKeys.intersectingRanges(range);
+	ASSERT(!ranges.empty()); // at least the range is allKeys
+
+	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+		if (it->cvalue().status != status)
+			return false;
+	}
+	return true;
+}
+
+void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
+	auto ranges = serverKeys.intersectingRanges(range);
+	ASSERT(!ranges.empty());
+	if (ranges.begin().range().contains(range)) {
+		CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
+		threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
+		return;
+	}
+	if (ranges.begin().begin() < range.begin) {
+		CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
+		twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
+	}
+	if (ranges.end().end() > range.end) {
+		CODE_PROBE(true, "Implicitly split end range to 2 pieces");
+		twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
+	}
+	ranges = serverKeys.containedRanges(range);
+	// now the boundary must be aligned
+	ASSERT(ranges.begin().begin() == range.begin);
+	ASSERT(ranges.end().end() == range.end);
+	uint64_t newSize = 0;
+	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+		newSize += it->cvalue().shardSize;
+	}
+	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+		auto oldStatus = it.value().status;
+		if (isStatusTransitionValid(oldStatus, status)) {
+			it.value() = ShardInfo{ status, newSize };
+		} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
+			CODE_PROBE(true, "Shard already on server");
+		} else {
+			TraceEvent(SevError, "MockShardStatusTransitionError")
+			    .detail("From", oldStatus)
+			    .detail("To", status)
+			    .detail("ID", id)
+			    .detail("KeyBegin", range.begin.toHexString())
+			    .detail("KeyEnd", range.begin.toHexString());
+		}
+	}
+	serverKeys.coalesce(range);
+}
+
+// split the out range [a, d) based on the inner range's boundary [b, c). The result would be [a,b), [b,c), [c,d). The
+// size of the new shards are randomly split from old size of [a, d)
+void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
+                                               KeyRangeRef innerRange,
+                                               uint64_t outerRangeSize,
+                                               bool restrictSize) {
+	ASSERT(outerRange.contains(innerRange));
+
+	Key left = outerRange.begin;
+	// random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid.
+	int leftSize = deterministicRandom()->randomInt(
+	    SERVER_KNOBS->MIN_SHARD_BYTES,
+	    restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
+	int midSize = deterministicRandom()->randomInt(
+	    SERVER_KNOBS->MIN_SHARD_BYTES,
+	    restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
+	int rightSize =
+	    restrictSize ? outerRangeSize - leftSize - midSize
+	                 : deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
+
+	serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
+	serverKeys[left].shardSize = leftSize;
+	serverKeys[innerRange.end].shardSize = rightSize;
+}
+
+// split the range [a,c) with split point b. The result would be [a, b), [b, c). The
+// size of the new shards are randomly split from old size of [a, c)
+void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
+                                             KeyRef splitPoint,
+                                             uint64_t rangeSize,
+                                             bool restrictSize) {
+	Key left = range.begin;
+	// random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid.
+	int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
+	                                                restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
+	                                                             : SERVER_KNOBS->MAX_SHARD_BYTES);
+	int rightSize =
+	    restrictSize ? rangeSize - leftSize
+	                 : deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
+	serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
+	serverKeys[left].shardSize = leftSize;
+}
+
+void MockStorageServer::removeShard(KeyRangeRef range) {
+	auto ranges = serverKeys.containedRanges(range);
+	ASSERT(ranges.begin().range() == range);
+	serverKeys.rawErase(range);
+}
+
+uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
+	auto ranges = serverKeys.intersectingRanges(range);
+	uint64_t totalSize = 0;
+	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+		totalSize += it->cvalue().shardSize;
+	}
+	return totalSize;
+}
+
+void MockStorageServer::addActor(Future<Void> future) {
+	actors.add(future);
+}
+
+void MockStorageServer::getSplitPoints(const SplitRangeRequest& req) {}
+
+Future<Void> MockStorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req) {
+	return MockStorageServerImpl::waitMetricsTenantAware(this, req);
+}
+
+void MockStorageServer::getStorageMetrics(const GetStorageMetricsRequest& req) {}
+
+Future<Void> MockStorageServer::run() {
+	ssi.locality = LocalityData(Optional<Standalone<StringRef>>(),
+	                            Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
+	                            Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
+	                            Optional<Standalone<StringRef>>());
+	ssi.initEndpoints();
+	ssi.startAcceptingRequests();
+	TraceEvent("MockStorageServerStart").detail("Address", ssi.address());
+	return serveStorageMetricsRequests(this, ssi);
+}
+
+void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
+	ASSERT(conf.storageTeamSize > 0);
+	configuration = conf;
+	std::vector<UID> serverIds;
+	for (int i = 1; i <= conf.storageTeamSize; ++i) {
+		UID id = indexToUID(i);
+		serverIds.push_back(id);
+		allServers[id] = MockStorageServer(id, defaultDiskSpace);
+		allServers[id].serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 });
+	}
+	shardMapping->assignRangeToTeams(allKeys, { Team(serverIds, true) });
+}
+
+void MockGlobalState::addStorageServer(StorageServerInterface server, uint64_t diskSpace) {
+	allServers[server.id()] = MockStorageServer(server, diskSpace);
+}
+
+bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef shard, bool inFlightShard) {
+	if (!allServers.count(serverId))
+		return false;
+
+	// check serverKeys
+	auto& mss = allServers.at(serverId);
+	if (!mss.allShardStatusEqual(shard, MockShardStatus::COMPLETED)) {
+		return false;
+	}
+
+	// check keyServers
+	auto teams = shardMapping->getTeamsForFirstShard(shard);
+	if (inFlightShard) {
+		return std::any_of(teams.second.begin(), teams.second.end(), [&serverId](const Team& team) {
+			return team.hasServer(serverId);
+		});
+	}
+	return std::any_of(
+	    teams.first.begin(), teams.first.end(), [&serverId](const Team& team) { return team.hasServer(serverId); });
+}
+
+bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) {
+	if (!allServers.count(serverId))
+		return false;
+
+	// check serverKeys
+	auto& mss = allServers.at(serverId);
+	if (!mss.allShardStatusEqual(shard, MockShardStatus::INFLIGHT)) {
+		return false;
+	}
+
+	// check keyServers
+	auto teams = shardMapping->getTeamsForFirstShard(shard);
+	return !teams.second.empty() && std::any_of(teams.first.begin(), teams.first.end(), [&serverId](const Team& team) {
+		return team.hasServer(serverId);
+	});
+}
+
+bool MockGlobalState::allShardsRemovedFromServer(const UID& serverId) {
+	return allServers.count(serverId) && shardMapping->getNumberOfShards(serverId) == 0;
+}
+
+Future<std::pair<Optional<StorageMetrics>, int>> MockGlobalState::waitStorageMetrics(
+    const KeyRange& keys,
+    const StorageMetrics& min,
+    const StorageMetrics& max,
+    const StorageMetrics& permittedError,
+    int shardLimit,
+    int expectedShardCount) {
+	return MockGlobalStateImpl::waitStorageMetrics(
+	    this, keys, min, max, permittedError, shardLimit, expectedShardCount);
+}
+
+Reference<LocationInfo> buildLocationInfo(const std::vector<StorageServerInterface>& interfaces) {
+	// construct the location info with the servers
+	std::vector<Reference<ReferencedInterface<StorageServerInterface>>> serverRefs;
+	serverRefs.reserve(interfaces.size());
+	for (const auto& interf : interfaces) {
+		serverRefs.push_back(makeReference<ReferencedInterface<StorageServerInterface>>(interf));
+	}
+
+	return makeReference<LocationInfo>(serverRefs);
+}
+
+Future<KeyRangeLocationInfo> MockGlobalState::getKeyLocation(TenantInfo tenant,
+                                                             Key key,
+                                                             SpanContext spanContext,
+                                                             Optional<UID> debugID,
+                                                             UseProvisionalProxies useProvisionalProxies,
+                                                             Reverse isBackward,
+                                                             Version version) {
+	if (isBackward) {
+		// DD never ask for backward range.
+		UNREACHABLE();
+	}
+	ASSERT(key < allKeys.end);
+
+	GetKeyServerLocationsReply rep;
+	KeyRange single = singleKeyRange(key);
+	auto teamPair = shardMapping->getTeamsForFirstShard(single);
+	auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second;
+	ASSERT_EQ(srcTeam.size(), 1);
+	rep.results.emplace_back(single, extractStorageServerInterfaces(srcTeam.front().servers));
+
+	return KeyRangeLocationInfo(
+	    rep.tenantEntry,
+	    KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
+	    buildLocationInfo(rep.results[0].second));
+}
+
+Future<std::vector<KeyRangeLocationInfo>> MockGlobalState::getKeyRangeLocations(
+    TenantInfo tenant,
+    KeyRange keys,
+    int limit,
+    Reverse reverse,
+    SpanContext spanContext,
+    Optional<UID> debugID,
+    UseProvisionalProxies useProvisionalProxies,
+    Version version) {
+
+	if (reverse) {
+		// DD never ask for backward range.
+		ASSERT(false);
+	}
+	ASSERT(keys.begin < keys.end);
+
+	GetKeyServerLocationsReply rep;
+	auto ranges = shardMapping->intersectingRanges(keys);
+	auto it = ranges.begin();
+	for (int count = 0; it != ranges.end() && count < limit; ++it, ++count) {
+		auto teamPair = shardMapping->getTeamsFor(it->begin());
+		auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second;
+		ASSERT_EQ(srcTeam.size(), 1);
+		rep.results.emplace_back(it->range(), extractStorageServerInterfaces(srcTeam.front().servers));
+	}
+	CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited", probe::decoration::rare);
+
+	std::vector<KeyRangeLocationInfo> results;
+	for (int shard = 0; shard < rep.results.size(); shard++) {
+		results.emplace_back(rep.tenantEntry,
+		                     (toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
+		                     buildLocationInfo(rep.results[shard].second));
+	}
+	return results;
+}
+
+std::vector<StorageServerInterface> MockGlobalState::extractStorageServerInterfaces(const std::vector<UID>& ids) const {
+	std::vector<StorageServerInterface> interfaces;
+	for (auto& id : ids) {
+		interfaces.emplace_back(allServers.at(id).ssi);
+	}
+	return interfaces;
+}
+
+Future<Standalone<VectorRef<KeyRef>>> MockGlobalState::splitStorageMetrics(const KeyRange& keys,
+                                                                           const StorageMetrics& limit,
+                                                                           const StorageMetrics& estimated,
+                                                                           const Optional<int>& minSplitBytes) {
+	return MockGlobalStateImpl::splitStorageMetrics(this, keys, limit, estimated, minSplitBytes);
+}
+
+TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
+	BasicTestConfig testConfig;
+	testConfig.simpleConfig = true;
+	testConfig.minimumReplication = 3;
+	testConfig.logAntiQuorum = 0;
+	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
+	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
+
+	auto mgs = std::make_shared<MockGlobalState>();
+	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
+	for (int i = 1; i <= dbConfig.storageTeamSize; ++i) {
+		auto id = MockGlobalState::indexToUID(i);
+		std::cout << "Check server " << i << "\n";
+		ASSERT(mgs->serverIsSourceForShard(id, allKeys));
+		ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
+	}
+
+	return Void();
+}
+
+struct MockGlobalStateTester {
+
+	// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
+	void testThreeWaySplitFirstRange(MockStorageServer& mss) {
+		auto it = mss.serverKeys.ranges().begin();
+		uint64_t oldSize =
+		    deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
+		MockShardStatus oldStatus = it.cvalue().status;
+		it->value().shardSize = oldSize;
+		KeyRangeRef outerRange = it->range();
+		Key x1 = keyAfter(it->range().begin);
+		Key x2 = keyAfter(x1);
+		std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
+
+		mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
+		auto ranges = mss.serverKeys.containedRanges(outerRange);
+		ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
+		ranges.pop_front();
+		ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
+		ASSERT(ranges.begin().cvalue().status == oldStatus);
+		ranges.pop_front();
+		ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
+		ranges.pop_front();
+		ASSERT(ranges.empty());
+	}
+
+	// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
+	void testTwoWaySplitFirstRange(MockStorageServer& mss) {
+		auto it = mss.serverKeys.nthRange(0);
+		MockShardStatus oldStatus = it.cvalue().status;
+		uint64_t oldSize =
+		    deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
+		it->value().shardSize = oldSize;
+		KeyRangeRef outerRange = it->range();
+		Key x1 = keyAfter(it->range().begin);
+		std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
+
+		mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
+		auto ranges = mss.serverKeys.containedRanges(outerRange);
+		ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
+		ranges.pop_front();
+		ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
+		ASSERT(ranges.begin().cvalue().status == oldStatus);
+		ranges.pop_front();
+		ASSERT(ranges.empty());
+	}
+
+	KeyRangeLocationInfo getKeyLocationInfo(KeyRef key, std::shared_ptr<MockGlobalState> mgs) {
+		return mgs
+		    ->getKeyLocation(
+		        TenantInfo(), key, SpanContext(), Optional<UID>(), UseProvisionalProxies::False, Reverse::False, 0)
+		    .get();
+	}
+
+	std::vector<KeyRangeLocationInfo> getKeyRangeLocations(KeyRangeRef keys,
+	                                                       int limit,
+	                                                       std::shared_ptr<MockGlobalState> mgs) {
+		return mgs
+		    ->getKeyRangeLocations(TenantInfo(),
+		                           keys,
+		                           limit,
+		                           Reverse::False,
+		                           SpanContext(),
+		                           Optional<UID>(),
+		                           UseProvisionalProxies::False,
+		                           0)
+		    .get();
+	}
+};
+
+TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
+	BasicTestConfig testConfig;
+	testConfig.simpleConfig = true;
+	testConfig.minimumReplication = 1;
+	testConfig.logAntiQuorum = 0;
+	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
+	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
+
+	auto mgs = std::make_shared<MockGlobalState>();
+	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
+
+	MockGlobalStateTester tester;
+	auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
+	std::cout << "Test 3-way splitting...\n";
+	tester.testThreeWaySplitFirstRange(mss);
+	std::cout << "Test 2-way splitting...\n";
+	mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
+	tester.testTwoWaySplitFirstRange(mss);
+
+	return Void();
+}
+
+namespace {
+inline bool locationInfoEqualsToTeam(Reference<LocationInfo> loc, const std::vector<UID>& ids) {
+	return loc->locations()->size() == ids.size() &&
+	       std::all_of(ids.begin(), ids.end(), [loc](const UID& id) { return loc->locations()->hasInterface(id); });
+}
+}; // namespace
+TEST_CASE("/MockGlobalState/MockStorageServer/GetKeyLocations") {
+	BasicTestConfig testConfig;
+	testConfig.simpleConfig = true;
+	testConfig.minimumReplication = 1;
+	testConfig.logAntiQuorum = 0;
+	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
+	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
+
+	auto mgs = std::make_shared<MockGlobalState>();
+	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
+	// add one empty server
+	mgs->addStorageServer(StorageServerInterface(mgs->indexToUID(mgs->allServers.size() + 1)));
+
+	// define 3 ranges:
+	// team 1 (UID 1,2,...,n-1):[begin, 1.0), [2.0, end)
+	// team 2 (UID 2,3,...n-1, n): [1.0, 2.0)
+	ShardsAffectedByTeamFailure::Team team1, team2;
+	for (int i = 0; i < mgs->allServers.size() - 1; ++i) {
+		UID id = mgs->indexToUID(i + 1);
+		team1.servers.emplace_back(id);
+		id = mgs->indexToUID(i + 2);
+		team2.servers.emplace_back(id);
+	}
+	Key one = doubleToTestKey(1.0), two = doubleToTestKey(2.0);
+	std::vector<KeyRangeRef> ranges{ KeyRangeRef(allKeys.begin, one),
+		                             KeyRangeRef(one, two),
+		                             KeyRangeRef(two, allKeys.end) };
+	mgs->shardMapping->assignRangeToTeams(ranges[0], { team1 });
+	mgs->shardMapping->assignRangeToTeams(ranges[1], { team2 });
+	mgs->shardMapping->assignRangeToTeams(ranges[2], { team1 });
+
+	// query key location
+	MockGlobalStateTester tester;
+	// -- team 1
+	Key testKey = doubleToTestKey(0.5);
+	auto locInfo = tester.getKeyLocationInfo(testKey, mgs);
+	ASSERT(locationInfoEqualsToTeam(locInfo.locations, team1.servers));
+
+	// -- team 2
+	testKey = doubleToTestKey(1.3);
+	locInfo = tester.getKeyLocationInfo(testKey, mgs);
+	ASSERT(locationInfoEqualsToTeam(locInfo.locations, team2.servers));
+
+	// query range location
+	testKey = doubleToTestKey(3.0);
+	// team 1,2,1
+	auto locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 100, mgs);
+	ASSERT(locInfos.size() == 3);
+	ASSERT(locInfos[0].range == ranges[0]);
+	ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers));
+	ASSERT(locInfos[1].range == ranges[1]);
+	ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers));
+	ASSERT(locInfos[2].range == KeyRangeRef(ranges[2].begin, testKey));
+	ASSERT(locationInfoEqualsToTeam(locInfos[2].locations, team1.servers));
+
+	// team 1,2
+	locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 2, mgs);
+	ASSERT(locInfos.size() == 2);
+	ASSERT(locInfos[0].range == ranges[0]);
+	ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers));
+	ASSERT(locInfos[1].range == ranges[1]);
+	ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers));
+
+	return Void();
+}
+
+TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") {
+	BasicTestConfig testConfig;
+	testConfig.simpleConfig = true;
+	testConfig.minimumReplication = 1;
+	testConfig.logAntiQuorum = 0;
+	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
+	TraceEvent("WaitStorageMetricsRequestUnitTestConfig").detail("Config", dbConfig.toString());
+
+	state std::shared_ptr<MockGlobalState> mgs = std::make_shared<MockGlobalState>();
+	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
+	state ActorCollection actors;
+
+	ActorCollection* ptr = &actors; // get around ACTOR syntax restriction
+	std::for_each(mgs->allServers.begin(), mgs->allServers.end(), [ptr](auto& server) {
+		ptr->add(server.second.run());
+		IFailureMonitor::failureMonitor().setStatus(server.second.ssi.address(), FailureStatus(false));
+		server.second.metrics.byteSample.sample.insert("something"_sr, 500000);
+	});
+
+	KeyRange testRange = allKeys;
+	ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
+	std::pair<Optional<StorageMetrics>, int> res =
+	    wait(mgs->waitStorageMetrics(testRange, bounds.min, bounds.max, bounds.permittedError, 1, 1));
+	// std::cout << "get result " << res.second << "\n";
+	// std::cout << "get byte "<< res.first.get().bytes << "\n";
+	ASSERT_EQ(res.second, -1); // the valid result always return -1, strange contraction though.
+	ASSERT_EQ(res.first.get().bytes, 500000);
+	return Void();
+}
--- a/fdbserver/MockGlobalState.cpp
+++ b/fdbserver/MockGlobalState.cpp
@ -1,281 +0,0 @@
-/*
- * MockGlobalState.cpp
- *
- * This source file is part of the FoundationDB open source project
- *
- * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "fdbserver/MockGlobalState.h"
-
-bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
-	auto ranges = serverKeys.intersectingRanges(range);
-	ASSERT(!ranges.empty()); // at least the range is allKeys
-
-	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
-		if (it->cvalue().status != status)
-			return false;
-	}
-	return true;
-}
-
-void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
-	auto ranges = serverKeys.intersectingRanges(range);
-	ASSERT(!ranges.empty());
-	if (ranges.begin().range().contains(range)) {
-		CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
-		threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
-		return;
-	}
-	if (ranges.begin().begin() < range.begin) {
-		CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
-		twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
-	}
-	if (ranges.end().end() > range.end) {
-		CODE_PROBE(true, "Implicitly split end range to 2 pieces");
-		twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
-	}
-	ranges = serverKeys.containedRanges(range);
-	// now the boundary must be aligned
-	ASSERT(ranges.begin().begin() == range.begin);
-	ASSERT(ranges.end().end() == range.end);
-	uint64_t newSize = 0;
-	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
-		newSize += it->cvalue().shardSize;
-	}
-	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
-		auto oldStatus = it.value().status;
-		if (isStatusTransitionValid(oldStatus, status)) {
-			it.value() = ShardInfo{ status, newSize };
-		} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
-			CODE_PROBE(true, "Shard already on server");
-		} else {
-			TraceEvent(SevError, "MockShardStatusTransitionError")
-			    .detail("From", oldStatus)
-			    .detail("To", status)
-			    .detail("ID", id)
-			    .detail("KeyBegin", range.begin.toHexString())
-			    .detail("KeyEnd", range.begin.toHexString());
-		}
-	}
-	serverKeys.coalesce(range);
-}
-
-// split the out range [a, d) based on the inner range's boundary [b, c). The result would be [a,b), [b,c), [c,d). The
-// size of the new shards are randomly split from old size of [a, d)
-void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
-                                               KeyRangeRef innerRange,
-                                               uint64_t outerRangeSize,
-                                               bool restrictSize) {
-	ASSERT(outerRange.contains(innerRange));
-
-	Key left = outerRange.begin;
-	// random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid.
-	int leftSize = deterministicRandom()->randomInt(
-	    SERVER_KNOBS->MIN_SHARD_BYTES,
-	    restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
-	int midSize = deterministicRandom()->randomInt(
-	    SERVER_KNOBS->MIN_SHARD_BYTES,
-	    restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
-	int rightSize =
-	    restrictSize ? outerRangeSize - leftSize - midSize
-	                 : deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
-
-	serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
-	serverKeys[left].shardSize = leftSize;
-	serverKeys[innerRange.end].shardSize = rightSize;
-}
-
-// split the range [a,c) with split point b. The result would be [a, b), [b, c). The
-// size of the new shards are randomly split from old size of [a, c)
-void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
-                                             KeyRef splitPoint,
-                                             uint64_t rangeSize,
-                                             bool restrictSize) {
-	Key left = range.begin;
-	// random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid.
-	int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
-	                                                restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
-	                                                             : SERVER_KNOBS->MAX_SHARD_BYTES);
-	int rightSize =
-	    restrictSize ? rangeSize - leftSize
-	                 : deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
-	serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
-	serverKeys[left].shardSize = leftSize;
-}
-
-void MockStorageServer::removeShard(KeyRangeRef range) {
-	auto ranges = serverKeys.containedRanges(range);
-	ASSERT(ranges.begin().range() == range);
-	serverKeys.rawErase(range);
-}
-
-uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
-	auto ranges = serverKeys.intersectingRanges(range);
-	uint64_t totalSize = 0;
-	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
-		totalSize += it->cvalue().shardSize;
-	}
-	return totalSize;
-}
-
-void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
-	ASSERT(conf.storageTeamSize > 0);
-	configuration = conf;
-	std::vector<UID> serverIds;
-	for (int i = 1; i <= conf.storageTeamSize; ++i) {
-		UID id = indexToUID(i);
-		serverIds.push_back(id);
-		allServers[id] = MockStorageServer(id, defaultDiskSpace);
-		allServers[id].serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 });
-	}
-	shardMapping->assignRangeToTeams(allKeys, { Team(serverIds, true) });
-}
-
-void MockGlobalState::addStorageServer(StorageServerInterface server, uint64_t diskSpace) {
-	allServers[server.id()] = MockStorageServer(server, diskSpace);
-}
-
-bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef shard, bool inFlightShard) {
-	if (!allServers.count(serverId))
-		return false;
-
-	// check serverKeys
-	auto& mss = allServers.at(serverId);
-	if (!mss.allShardStatusEqual(shard, MockShardStatus::COMPLETED)) {
-		return false;
-	}
-
-	// check keyServers
-	auto teams = shardMapping->getTeamsFor(shard);
-	if (inFlightShard) {
-		return std::any_of(teams.second.begin(), teams.second.end(), [&serverId](const Team& team) {
-			return team.hasServer(serverId);
-		});
-	}
-	return std::any_of(
-	    teams.first.begin(), teams.first.end(), [&serverId](const Team& team) { return team.hasServer(serverId); });
-}
-
-bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) {
-	if (!allServers.count(serverId))
-		return false;
-
-	// check serverKeys
-	auto& mss = allServers.at(serverId);
-	if (!mss.allShardStatusEqual(shard, MockShardStatus::INFLIGHT)) {
-		return false;
-	}
-
-	// check keyServers
-	auto teams = shardMapping->getTeamsFor(shard);
-	return !teams.second.empty() && std::any_of(teams.first.begin(), teams.first.end(), [&serverId](const Team& team) {
-		return team.hasServer(serverId);
-	});
-}
-
-bool MockGlobalState::allShardRemovedFromServer(const UID& serverId) {
-	return allServers.count(serverId) && shardMapping->getNumberOfShards(serverId) == 0;
-}
-
-TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
-	BasicTestConfig testConfig;
-	testConfig.simpleConfig = true;
-	testConfig.minimumReplication = 3;
-	testConfig.logAntiQuorum = 0;
-	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
-	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
-
-	auto mgs = std::make_shared<MockGlobalState>();
-	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
-	for (int i = 1; i <= dbConfig.storageTeamSize; ++i) {
-		auto id = MockGlobalState::indexToUID(i);
-		std::cout << "Check server " << i << "\n";
-		ASSERT(mgs->serverIsSourceForShard(id, allKeys));
-		ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
-	}
-
-	return Void();
-}
-
-struct MockGlobalStateTester {
-
-	// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
-	void testThreeWaySplitFirstRange(MockStorageServer& mss) {
-		auto it = mss.serverKeys.ranges().begin();
-		uint64_t oldSize =
-		    deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
-		MockShardStatus oldStatus = it.cvalue().status;
-		it->value().shardSize = oldSize;
-		KeyRangeRef outerRange = it->range();
-		Key x1 = keyAfter(it->range().begin);
-		Key x2 = keyAfter(x1);
-		std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
-
-		mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
-		auto ranges = mss.serverKeys.containedRanges(outerRange);
-		ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
-		ranges.pop_front();
-		ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
-		ASSERT(ranges.begin().cvalue().status == oldStatus);
-		ranges.pop_front();
-		ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
-		ranges.pop_front();
-		ASSERT(ranges.empty());
-	}
-
-	// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
-	void testTwoWaySplitFirstRange(MockStorageServer& mss) {
-		auto it = mss.serverKeys.nthRange(0);
-		MockShardStatus oldStatus = it.cvalue().status;
-		uint64_t oldSize =
-		    deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
-		it->value().shardSize = oldSize;
-		KeyRangeRef outerRange = it->range();
-		Key x1 = keyAfter(it->range().begin);
-		std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
-
-		mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
-		auto ranges = mss.serverKeys.containedRanges(outerRange);
-		ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
-		ranges.pop_front();
-		ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
-		ASSERT(ranges.begin().cvalue().status == oldStatus);
-		ranges.pop_front();
-		ASSERT(ranges.empty());
-	}
-};
-
-TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
-	BasicTestConfig testConfig;
-	testConfig.simpleConfig = true;
-	testConfig.minimumReplication = 1;
-	testConfig.logAntiQuorum = 0;
-	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
-	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
-
-	auto mgs = std::make_shared<MockGlobalState>();
-	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
-
-	MockGlobalStateTester tester;
-	auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
-	std::cout << "Test 3-way splitting...\n";
-	tester.testThreeWaySplitFirstRange(mss);
-	std::cout << "Test 2-way splitting...\n";
-	mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
-	tester.testTwoWaySplitFirstRange(mss);
-
-	return Void();
-}
--- a/fdbserver/OldTLogServer_4_6.actor.cpp
+++ b/fdbserver/OldTLogServer_4_6.actor.cpp
@ -447,10 +447,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
 		          "Restored");
 		addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));

-		persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
-		persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
-		version.initMetric("TLog.Version"_sr, cc.id);
-		queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
+		persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
+		persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
+		version.initMetric("TLog.Version"_sr, cc.getId());
+		queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());

 		specialCounter(cc, "Version", [this]() { return this->version.get(); });
 		specialCounter(cc, "SharedBytesInput", [tLogData]() { return tLogData->bytesInput; });
@ -1399,26 +1399,26 @@ ACTOR Future<Void> tLogCore(TLogData* self, Reference<LogData> logData) {
 	logData->addActor.send(waitFailureServer(logData->tli.waitFailure.getFuture()));
 	logData->addActor.send(logData->removed);
 	// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
-	logData->addActor.send(traceCounters("TLogMetrics",
-	                                     logData->logId,
-	                                     SERVER_KNOBS->STORAGE_LOGGING_DELAY,
-	                                     &logData->cc,
-	                                     logData->logId.toString() + "/TLogMetrics",
-	                                     [self = self](TraceEvent& te) {
-		                                     StorageBytes sbTlog = self->persistentData->getStorageBytes();
-		                                     te.detail("KvstoreBytesUsed", sbTlog.used);
-		                                     te.detail("KvstoreBytesFree", sbTlog.free);
-		                                     te.detail("KvstoreBytesAvailable", sbTlog.available);
-		                                     te.detail("KvstoreBytesTotal", sbTlog.total);
-		                                     te.detail("KvstoreBytesTemp", sbTlog.temp);
+	logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
+	                                                 logData->logId,
+	                                                 SERVER_KNOBS->STORAGE_LOGGING_DELAY,
+	                                                 logData->logId.toString() + "/TLogMetrics",
+	                                                 [self = self](TraceEvent& te) {
+		                                                 StorageBytes sbTlog = self->persistentData->getStorageBytes();
+		                                                 te.detail("KvstoreBytesUsed", sbTlog.used);
+		                                                 te.detail("KvstoreBytesFree", sbTlog.free);
+		                                                 te.detail("KvstoreBytesAvailable", sbTlog.available);
+		                                                 te.detail("KvstoreBytesTotal", sbTlog.total);
+		                                                 te.detail("KvstoreBytesTemp", sbTlog.temp);

-		                                     StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
-		                                     te.detail("QueueDiskBytesUsed", sbQueue.used);
-		                                     te.detail("QueueDiskBytesFree", sbQueue.free);
-		                                     te.detail("QueueDiskBytesAvailable", sbQueue.available);
-		                                     te.detail("QueueDiskBytesTotal", sbQueue.total);
-		                                     te.detail("QueueDiskBytesTemp", sbQueue.temp);
-	                                     }));
+		                                                 StorageBytes sbQueue =
+		                                                     self->rawPersistentQueue->getStorageBytes();
+		                                                 te.detail("QueueDiskBytesUsed", sbQueue.used);
+		                                                 te.detail("QueueDiskBytesFree", sbQueue.free);
+		                                                 te.detail("QueueDiskBytesAvailable", sbQueue.available);
+		                                                 te.detail("QueueDiskBytesTotal", sbQueue.total);
+		                                                 te.detail("QueueDiskBytesTemp", sbQueue.temp);
+	                                                 }));

 	logData->addActor.send(serveTLogInterface(self, logData->tli, logData, warningCollectorInput));

--- a/fdbserver/OldTLogServer_6_0.actor.cpp
+++ b/fdbserver/OldTLogServer_6_0.actor.cpp
@ -533,10 +533,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
 		          context);
 		addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));

-		persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
-		persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
-		version.initMetric("TLog.Version"_sr, cc.id);
-		queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
+		persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
+		persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
+		version.initMetric("TLog.Version"_sr, cc.getId());
+		queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());

 		specialCounter(cc, "Version", [this]() { return this->version.get(); });
 		specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2212,26 +2212,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
 	logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
 	logData->addActor.send(logData->removed);
 	// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
-	logData->addActor.send(traceCounters("TLogMetrics",
-	                                     logData->logId,
-	                                     SERVER_KNOBS->STORAGE_LOGGING_DELAY,
-	                                     &logData->cc,
-	                                     logData->logId.toString() + "/TLogMetrics",
-	                                     [self = self](TraceEvent& te) {
-		                                     StorageBytes sbTlog = self->persistentData->getStorageBytes();
-		                                     te.detail("KvstoreBytesUsed", sbTlog.used);
-		                                     te.detail("KvstoreBytesFree", sbTlog.free);
-		                                     te.detail("KvstoreBytesAvailable", sbTlog.available);
-		                                     te.detail("KvstoreBytesTotal", sbTlog.total);
-		                                     te.detail("KvstoreBytesTemp", sbTlog.temp);
+	logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
+	                                                 logData->logId,
+	                                                 SERVER_KNOBS->STORAGE_LOGGING_DELAY,
+	                                                 logData->logId.toString() + "/TLogMetrics",
+	                                                 [self = self](TraceEvent& te) {
+		                                                 StorageBytes sbTlog = self->persistentData->getStorageBytes();
+		                                                 te.detail("KvstoreBytesUsed", sbTlog.used);
+		                                                 te.detail("KvstoreBytesFree", sbTlog.free);
+		                                                 te.detail("KvstoreBytesAvailable", sbTlog.available);
+		                                                 te.detail("KvstoreBytesTotal", sbTlog.total);
+		                                                 te.detail("KvstoreBytesTemp", sbTlog.temp);

-		                                     StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
-		                                     te.detail("QueueDiskBytesUsed", sbQueue.used);
-		                                     te.detail("QueueDiskBytesFree", sbQueue.free);
-		                                     te.detail("QueueDiskBytesAvailable", sbQueue.available);
-		                                     te.detail("QueueDiskBytesTotal", sbQueue.total);
-		                                     te.detail("QueueDiskBytesTemp", sbQueue.temp);
-	                                     }));
+		                                                 StorageBytes sbQueue =
+		                                                     self->rawPersistentQueue->getStorageBytes();
+		                                                 te.detail("QueueDiskBytesUsed", sbQueue.used);
+		                                                 te.detail("QueueDiskBytesFree", sbQueue.free);
+		                                                 te.detail("QueueDiskBytesAvailable", sbQueue.available);
+		                                                 te.detail("QueueDiskBytesTotal", sbQueue.total);
+		                                                 te.detail("QueueDiskBytesTemp", sbQueue.temp);
+	                                                 }));

 	logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
 	logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
--- a/fdbserver/OldTLogServer_6_2.actor.cpp
+++ b/fdbserver/OldTLogServer_6_2.actor.cpp
@ -616,10 +616,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
 		          context);
 		addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));

-		persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
-		persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
-		version.initMetric("TLog.Version"_sr, cc.id);
-		queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
+		persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
+		persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
+		version.initMetric("TLog.Version"_sr, cc.getId());
+		queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());

 		specialCounter(cc, "Version", [this]() { return this->version.get(); });
 		specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2671,26 +2671,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
 	logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
 	logData->addActor.send(logData->removed);
 	// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
-	logData->addActor.send(traceCounters("TLogMetrics",
-	                                     logData->logId,
-	                                     SERVER_KNOBS->STORAGE_LOGGING_DELAY,
-	                                     &logData->cc,
-	                                     logData->logId.toString() + "/TLogMetrics",
-	                                     [self = self](TraceEvent& te) {
-		                                     StorageBytes sbTlog = self->persistentData->getStorageBytes();
-		                                     te.detail("KvstoreBytesUsed", sbTlog.used);
-		                                     te.detail("KvstoreBytesFree", sbTlog.free);
-		                                     te.detail("KvstoreBytesAvailable", sbTlog.available);
-		                                     te.detail("KvstoreBytesTotal", sbTlog.total);
-		                                     te.detail("KvstoreBytesTemp", sbTlog.temp);
+	logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
+	                                                 logData->logId,
+	                                                 SERVER_KNOBS->STORAGE_LOGGING_DELAY,
+	                                                 logData->logId.toString() + "/TLogMetrics",
+	                                                 [self = self](TraceEvent& te) {
+		                                                 StorageBytes sbTlog = self->persistentData->getStorageBytes();
+		                                                 te.detail("KvstoreBytesUsed", sbTlog.used);
+		                                                 te.detail("KvstoreBytesFree", sbTlog.free);
+		                                                 te.detail("KvstoreBytesAvailable", sbTlog.available);
+		                                                 te.detail("KvstoreBytesTotal", sbTlog.total);
+		                                                 te.detail("KvstoreBytesTemp", sbTlog.temp);

-		                                     StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
-		                                     te.detail("QueueDiskBytesUsed", sbQueue.used);
-		                                     te.detail("QueueDiskBytesFree", sbQueue.free);
-		                                     te.detail("QueueDiskBytesAvailable", sbQueue.available);
-		                                     te.detail("QueueDiskBytesTotal", sbQueue.total);
-		                                     te.detail("QueueDiskBytesTemp", sbQueue.temp);
-	                                     }));
+		                                                 StorageBytes sbQueue =
+		                                                     self->rawPersistentQueue->getStorageBytes();
+		                                                 te.detail("QueueDiskBytesUsed", sbQueue.used);
+		                                                 te.detail("QueueDiskBytesFree", sbQueue.free);
+		                                                 te.detail("QueueDiskBytesAvailable", sbQueue.available);
+		                                                 te.detail("QueueDiskBytesTotal", sbQueue.total);
+		                                                 te.detail("QueueDiskBytesTemp", sbQueue.temp);
+	                                                 }));

 	logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
 	logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
--- a/fdbserver/Resolver.actor.cpp
+++ b/fdbserver/Resolver.actor.cpp
@ -35,7 +35,7 @@
 #include "fdbserver/ResolverInterface.h"
 #include "fdbserver/RestoreUtil.h"
 #include "fdbserver/ServerDBInfo.h"
-#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/StorageMetrics.actor.h"
 #include "fdbserver/WaitFailure.h"
 #include "fdbserver/WorkerInterface.actor.h"
 #include "flow/ActorCollection.h"
@ -188,7 +188,7 @@ struct Resolver : ReferenceCounted<Resolver> {
 		specialCounter(cc, "NeededVersion", [this]() { return this->neededVersion.get(); });
 		specialCounter(cc, "TotalStateBytes", [this]() { return this->totalStateBytes.get(); });

-		logger = traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ResolverMetrics");
+		logger = cc.traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ResolverMetrics");
 	}
 	~Resolver() { destroyConflictSet(conflictSet); }
 };
--- a/fdbserver/RestoreLoader.actor.cpp
+++ b/fdbserver/RestoreLoader.actor.cpp
@ -30,7 +30,7 @@
 #include "fdbserver/RestoreLoader.actor.h"
 #include "fdbserver/RestoreRoleCommon.actor.h"
 #include "fdbserver/MutationTracking.h"
-#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/StorageMetrics.actor.h"

 #include "flow/actorcompiler.h" // This must be the last #include.

@ -405,10 +405,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 	    .detail("Offset", asset.offset)
 	    .detail("Length", asset.len);

-	// Ensure data blocks in the same file are processed in order
-	wait(processedFileOffset->whenAtLeast(asset.offset));
-	ASSERT(processedFileOffset->get() == asset.offset);
-
 	state Arena tempArena;
 	state StringRefReader reader(buf, restore_corrupted_data());
 	try {
@ -430,8 +426,9 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 			const uint8_t* message = reader.consume(msgSize);

 			// Skip mutations out of the version range
-			if (!asset.isInVersionRange(msgVersion.version))
+			if (!asset.isInVersionRange(msgVersion.version)) {
 				continue;
+			}

 			state VersionedMutationsMap::iterator it;
 			bool inserted;
@ -452,6 +449,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 			// Skip mutation whose commitVesion < range kv's version
 			if (logMutationTooOld(pRangeVersions, mutation, msgVersion.version)) {
 				cc->oldLogMutations += 1;
+				wait(yield()); // avoid potential stack overflows
 				continue;
 			}

@ -459,6 +457,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 			if (mutation.param1 >= asset.range.end ||
 			    (isRangeMutation(mutation) && mutation.param2 < asset.range.begin) ||
 			    (!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) {
+				wait(yield()); // avoid potential stack overflows
 				continue;
 			}

@ -509,7 +508,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 		    .detail("BlockLen", asset.len);
 		throw;
 	}
-	processedFileOffset->set(asset.offset + asset.len);
 	return Void();
 }

@ -526,8 +524,19 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
 	state int readFileRetries = 0;
 	loop {
 		try {
+			// Ensure data blocks in the same file are processed in order
+			wait(processedFileOffset->whenAtLeast(asset.offset));
+			ASSERT(processedFileOffset->get() == asset.offset);
+
 			wait(_parsePartitionedLogFileOnLoader(
 			    pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset, cx));
+			processedFileOffset->set(asset.offset + asset.len);
+
+			TraceEvent("FastRestoreLoaderDecodingLogFileDone")
+			    .detail("BatchIndex", asset.batchIndex)
+			    .detail("Filename", asset.filename)
+			    .detail("Offset", asset.offset)
+			    .detail("Length", asset.len);
 			break;
 		} catch (Error& e) {
 			if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||
--- a/fdbserver/ShardsAffectedByTeamFailure.cpp
+++ b/fdbserver/ShardsAffectedByTeamFailure.cpp
@ -40,10 +40,16 @@ int ShardsAffectedByTeamFailure::getNumberOfShards(UID ssID) const {
 }

 std::pair<std::vector<ShardsAffectedByTeamFailure::Team>, std::vector<ShardsAffectedByTeamFailure::Team>>
-ShardsAffectedByTeamFailure::getTeamsFor(KeyRangeRef keys) {
+ShardsAffectedByTeamFailure::getTeamsForFirstShard(KeyRangeRef keys) {
 	return shard_teams[keys.begin];
 }

+std::pair<std::vector<ShardsAffectedByTeamFailure::Team>, std::vector<ShardsAffectedByTeamFailure::Team>>
+
+ShardsAffectedByTeamFailure::getTeamsFor(KeyRef key) {
+	return shard_teams[key];
+}
+
 void ShardsAffectedByTeamFailure::erase(Team team, KeyRange const& range) {
 	DisabledTraceEvent(SevDebug, "ShardsAffectedByTeamFailureErase")
 	    .detail("Range", range)
@ -236,3 +242,7 @@ void ShardsAffectedByTeamFailure::removeFailedServerForRange(KeyRangeRef keys, c
 	}
 	check();
 }
+
+auto ShardsAffectedByTeamFailure::intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges {
+	return shard_teams.intersectingRanges(keyRange);
+}
--- a/fdbserver/SimpleConfigConsumer.actor.cpp
+++ b/fdbserver/SimpleConfigConsumer.actor.cpp
@ -166,8 +166,8 @@ public:
 	    successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc),
 	    snapshotRequest("SnapshotRequest", cc) {
 		cfi = getConfigFollowerInterface(configSource);
-		logger = traceCounters(
-		    "ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigConsumerMetrics");
+		logger = cc.traceCounters(
+		    "ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigConsumerMetrics");
 	}

 	Future<Void> consume(ConfigBroadcaster& broadcaster) {
--- a/fdbserver/StorageCache.actor.cpp
+++ b/fdbserver/StorageCache.actor.cpp
@ -248,9 +248,9 @@ public:
 	    lastTLogVersion(0), lastVersionWithData(0), peekVersion(0), compactionInProgress(Void()),
 	    fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), debug_inApplyUpdate(false),
 	    debug_lastValidateTime(0), versionLag(0), behind(false), counters(this) {
-		version.initMetric("StorageCacheData.Version"_sr, counters.cc.id);
-		desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.id);
-		oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.id);
+		version.initMetric("StorageCacheData.Version"_sr, counters.cc.getId());
+		desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.getId());
+		oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.getId());

 		newestAvailableVersion.insert(allKeys, invalidVersion);
 		newestDirtyVersion.insert(allKeys, invalidVersion);
@ -2224,11 +2224,10 @@ ACTOR Future<Void> storageCacheServer(StorageServerInterface ssi,
 	self.ck = cacheKeysPrefixFor(id).withPrefix(systemKeys.begin); // FFFF/02cacheKeys/[this server]/

 	actors.add(waitFailureServer(ssi.waitFailure.getFuture()));
-	actors.add(traceCounters("CacheMetrics",
-	                         self.thisServerID,
-	                         SERVER_KNOBS->STORAGE_LOGGING_DELAY,
-	                         &self.counters.cc,
-	                         self.thisServerID.toString() + "/CacheMetrics"));
+	actors.add(self.counters.cc.traceCounters("CacheMetrics",
+	                                          self.thisServerID,
+	                                          SERVER_KNOBS->STORAGE_LOGGING_DELAY,
+	                                          self.thisServerID.toString() + "/CacheMetrics"));

 	// fetch already cached ranges from the database and apply them before proceeding
 	wait(storageCacheStartUpWarmup(&self));
--- a/fdbserver/StorageMetrics.actor.cpp
+++ b/fdbserver/StorageMetrics.actor.cpp
@ -19,7 +19,7 @@
 */

 #include "flow/UnitTest.h"
-#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/StorageMetrics.actor.h"
 #include "flow/actorcompiler.h" // This must be the last #include.

 int64_t StorageMetricSample::getEstimate(KeyRangeRef keys) const {
--- a/fdbserver/TLogServer.actor.cpp
+++ b/fdbserver/TLogServer.actor.cpp
@ -652,10 +652,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
 		          context);
 		addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));

-		persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
-		persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
-		version.initMetric("TLog.Version"_sr, cc.id);
-		queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
+		persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
+		persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
+		version.initMetric("TLog.Version"_sr, cc.getId());
+		queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());

 		specialCounter(cc, "Version", [this]() { return this->version.get(); });
 		specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2930,26 +2930,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
 	logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
 	logData->addActor.send(logData->removed);
 	// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
-	logData->addActor.send(traceCounters("TLogMetrics",
-	                                     logData->logId,
-	                                     SERVER_KNOBS->STORAGE_LOGGING_DELAY,
-	                                     &logData->cc,
-	                                     logData->logId.toString() + "/TLogMetrics",
-	                                     [self = self](TraceEvent& te) {
-		                                     StorageBytes sbTlog = self->persistentData->getStorageBytes();
-		                                     te.detail("KvstoreBytesUsed", sbTlog.used);
-		                                     te.detail("KvstoreBytesFree", sbTlog.free);
-		                                     te.detail("KvstoreBytesAvailable", sbTlog.available);
-		                                     te.detail("KvstoreBytesTotal", sbTlog.total);
-		                                     te.detail("KvstoreBytesTemp", sbTlog.temp);
+	logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
+	                                                 logData->logId,
+	                                                 SERVER_KNOBS->STORAGE_LOGGING_DELAY,
+	                                                 logData->logId.toString() + "/TLogMetrics",
+	                                                 [self = self](TraceEvent& te) {
+		                                                 StorageBytes sbTlog = self->persistentData->getStorageBytes();
+		                                                 te.detail("KvstoreBytesUsed", sbTlog.used);
+		                                                 te.detail("KvstoreBytesFree", sbTlog.free);
+		                                                 te.detail("KvstoreBytesAvailable", sbTlog.available);
+		                                                 te.detail("KvstoreBytesTotal", sbTlog.total);
+		                                                 te.detail("KvstoreBytesTemp", sbTlog.temp);

-		                                     StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
-		                                     te.detail("QueueDiskBytesUsed", sbQueue.used);
-		                                     te.detail("QueueDiskBytesFree", sbQueue.free);
-		                                     te.detail("QueueDiskBytesAvailable", sbQueue.available);
-		                                     te.detail("QueueDiskBytesTotal", sbQueue.total);
-		                                     te.detail("QueueDiskBytesTemp", sbQueue.temp);
-	                                     }));
+		                                                 StorageBytes sbQueue =
+		                                                     self->rawPersistentQueue->getStorageBytes();
+		                                                 te.detail("QueueDiskBytesUsed", sbQueue.used);
+		                                                 te.detail("QueueDiskBytesFree", sbQueue.free);
+		                                                 te.detail("QueueDiskBytesAvailable", sbQueue.available);
+		                                                 te.detail("QueueDiskBytesTotal", sbQueue.total);
+		                                                 te.detail("QueueDiskBytesTemp", sbQueue.temp);
+	                                                 }));

 	logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
 	logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
--- a/fdbserver/TenantCache.actor.cpp
+++ b/fdbserver/TenantCache.actor.cpp
@ -122,19 +122,20 @@ public:
 	ACTOR static Future<Void> monitorStorageUsage(TenantCache* tenantCache) {
 		TraceEvent(SevInfo, "StartingTenantCacheStorageUsageMonitor", tenantCache->id()).log();

-		state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_REFRESH_INTERVAL;
+		state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL;
 		state double lastTenantListFetchTime = now();

 		loop {
 			state double fetchStartTime = now();
-			state std::vector<std::pair<KeyRef, TenantName>> tenantList = tenantCache->getTenantList();
+			state std::vector<TenantName> tenants = tenantCache->getTenantList();
 			state int i;
-			for (i = 0; i < tenantList.size(); i++) {
-				state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenantList[i].second);
+			for (i = 0; i < tenants.size(); i++) {
+				state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]);
 				loop {
 					try {
 						state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
-						tenantCache->updateStorageUsage(tenantList[i].first, size);
+						tenantCache->tenantStorageMap[tenants[i]].usage = size;
+						break;
 					} catch (Error& e) {
 						TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
 						wait(tr.onError(e));
@ -149,6 +150,31 @@ public:
 			wait(delay(refreshInterval));
 		}
 	}
+
+	ACTOR static Future<Void> monitorStorageQuota(TenantCache* tenantCache) {
+		TraceEvent(SevInfo, "StartingTenantCacheStorageQuotaMonitor", tenantCache->id()).log();
+
+		state Transaction tr(tenantCache->dbcx());
+
+		loop {
+			loop {
+				try {
+					state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
+					for (auto const kv : currentQuotas) {
+						TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix);
+						int64_t const quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
+						tenantCache->tenantStorageMap[tenant].quota = quota;
+					}
+					tr.reset();
+					break;
+				} catch (Error& e) {
+					TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
+					wait(tr.onError(e));
+				}
+			}
+			wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
+		}
+	}
 };

 void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) {
@ -203,21 +229,14 @@ int TenantCache::cleanup() {
 	return tenantsRemoved;
 }

-std::vector<std::pair<KeyRef, TenantName>> TenantCache::getTenantList() const {
-	std::vector<std::pair<KeyRef, TenantName>> tenants;
+std::vector<TenantName> TenantCache::getTenantList() const {
+	std::vector<TenantName> tenants;
 	for (const auto& [prefix, entry] : tenantCache) {
-		tenants.push_back({ prefix, entry->name() });
+		tenants.push_back(entry->name());
 	}
 	return tenants;
 }

-void TenantCache::updateStorageUsage(KeyRef prefix, int64_t size) {
-	auto it = tenantCache.find(prefix);
-	if (it != tenantCache.end()) {
-		it->value->updateStorageUsage(size);
-	}
-}
-
 std::string TenantCache::desc() const {
 	std::string s("@Generation: ");
 	s += std::to_string(generation) + " ";
@ -264,6 +283,16 @@ Optional<Reference<TCTenantInfo>> TenantCache::tenantOwning(KeyRef key) const {
 	return it->value;
 }

+std::vector<TenantName> TenantCache::getTenantsOverQuota() const {
+	std::vector<TenantName> tenants;
+	for (const auto& [tenant, storage] : tenantStorageMap) {
+		if (storage.usage > storage.quota) {
+			tenants.push_back(tenant);
+		}
+	}
+	return tenants;
+}
+
 Future<Void> TenantCache::monitorTenantMap() {
 	return TenantCacheImpl::monitorTenantMap(this);
 }
@ -272,6 +301,10 @@ Future<Void> TenantCache::monitorStorageUsage() {
 	return TenantCacheImpl::monitorStorageUsage(this);
 }

+Future<Void> TenantCache::monitorStorageQuota() {
+	return TenantCacheImpl::monitorStorageQuota(this);
+}
+
 class TenantCacheUnitTest {
 public:
 	ACTOR static Future<Void> InsertAndTestPresence() {
--- a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h
+++ b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h
@ -140,9 +140,27 @@ private:
 	Future<Void> collection;
 };

+// Defines granule info that interests full restore
+struct BlobGranuleRestoreVersion {
+	// Two constructors required by VectorRef
+	BlobGranuleRestoreVersion() {}
+	BlobGranuleRestoreVersion(Arena& a, const BlobGranuleRestoreVersion& copyFrom)
+	  : granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version),
+	    sizeInBytes(copyFrom.sizeInBytes) {}
+
+	UID granuleID;
+	KeyRangeRef keyRange;
+	Version version;
+	int64_t sizeInBytes;
+};
+
+// Defines a vector for BlobGranuleVersion
+typedef Standalone<VectorRef<BlobGranuleRestoreVersion>> BlobGranuleRestoreVersionVector;
+
 ACTOR Future<Void> dumpManifest(Database db, Reference<BlobConnectionProvider> blobConn, int64_t epoch, int64_t seqNo);
 ACTOR Future<Void> loadManifest(Database db, Reference<BlobConnectionProvider> blobConn);
 ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn);
+ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn);
 inline bool isFullRestoreMode() {
 	return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE;
 };
--- a/fdbserver/include/fdbserver/BlobMigratorInterface.h
+++ b/fdbserver/include/fdbserver/BlobMigratorInterface.h
@ -30,23 +30,25 @@
 struct BlobMigratorInterface {
 	constexpr static FileIdentifier file_identifier = 869199;
 	RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator;
-	RequestStream<ReplyPromise<Void>> waitFailure;
 	LocalityData locality;
 	UID uniqueID;
+	StorageServerInterface ssi;

 	BlobMigratorInterface() {}
-	BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {}
+	BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {
+		ssi.locality = l;
+		ssi.uniqueID = id;
+	}

-	void initEndpoints() {}
+	void initEndpoints() { ssi.initEndpoints(); }
 	UID id() const { return uniqueID; }
-	NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); }
+	NetworkAddress address() const { return haltBlobMigrator.getEndpoint().getPrimaryAddress(); }
 	bool operator==(const BlobMigratorInterface& r) const { return id() == r.id(); }
 	bool operator!=(const BlobMigratorInterface& r) const { return !(*this == r); }

 	template <class Archive>
 	void serialize(Archive& ar) {
-		// StorageServerInterface::serialize(ar);
-		serializer(ar, waitFailure, haltBlobMigrator, locality, uniqueID);
+		serializer(ar, locality, uniqueID, haltBlobMigrator);
 	}
 };

--- a/fdbserver/include/fdbserver/ClusterRecovery.actor.h
+++ b/fdbserver/include/fdbserver/ClusterRecovery.actor.h
@ -289,11 +289,10 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
 		    getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_DURATION_EVENT_NAME));
 		clusterRecoveryAvailableEventHolder = makeReference<EventCacheHolder>(
 		    getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_AVAILABLE_EVENT_NAME));
-		logger = traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME),
-		                       dbgid,
-		                       SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
-		                       &cc,
-		                       getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME));
+		logger = cc.traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME),
+		                          dbgid,
+		                          SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
+		                          getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME));
 		if (forceRecovery && !controllerData->clusterControllerDcId.present()) {
 			TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log();
 			forceRecovery = false;
--- a/fdbserver/include/fdbserver/DDTxnProcessor.h
+++ b/fdbserver/include/fdbserver/DDTxnProcessor.h
@ -117,6 +117,7 @@ public:

 	virtual Future<Void> moveKeys(const MoveKeysParams& params) = 0;

+	// metrics.second is the number of key-ranges (i.e., shards) in the 'keys' key-range
 	virtual Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
 	                                                                            StorageMetrics const& min,
 	                                                                            StorageMetrics const& max,
--- a/fdbserver/include/fdbserver/DataDistribution.actor.h
+++ b/fdbserver/include/fdbserver/DataDistribution.actor.h
@ -476,6 +476,8 @@ struct ShardSizeBounds {
 	bool operator==(ShardSizeBounds const& rhs) const {
 		return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError;
 	}
+
+	static ShardSizeBounds shardSizeBoundsBeforeTrack();
 };

 // Gets the permitted size and IO bounds for a shard
@ -484,10 +486,6 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize);
 // Determines the maximum shard size based on the size of the database
 int64_t getMaxShardSize(double dbSizeEstimate);

-struct StorageQuotaInfo {
-	std::map<Key, uint64_t> quotaMap;
-};
-
 #ifndef __INTEL_COMPILER
 #pragma endregion
 #endif
--- a/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h
+++ b/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h
@ -46,6 +46,7 @@ class GrvProxyTransactionTagThrottler {
 		  : req(req), startTime(now()), sequenceNumber(++lastSequenceNumber) {}

 		void updateProxyTagThrottledDuration();
+		bool isMaxThrottled() const;
 	};

 	struct TagQueue {
@ -56,6 +57,8 @@ class GrvProxyTransactionTagThrottler {
 		explicit TagQueue(double rate) : rateInfo(rate) {}

 		void setRate(double rate);
+		bool isMaxThrottled() const;
+		void rejectRequests();
 	};

 	// Track the budgets for each tag
@ -69,8 +72,8 @@ public:
 	// If a request is ready to be executed, it is sent to the deque
 	// corresponding to its priority. If not, the request remains queued.
 	void releaseTransactions(double elapsed,
-	                         SpannedDeque<GetReadVersionRequest>& outBatchPriority,
-	                         SpannedDeque<GetReadVersionRequest>& outDefaultPriority);
+	                         Deque<GetReadVersionRequest>& outBatchPriority,
+	                         Deque<GetReadVersionRequest>& outDefaultPriority);

 	void addRequest(GetReadVersionRequest const&);

--- a/fdbserver/include/fdbserver/IKeyValueStore.h
+++ b/fdbserver/include/fdbserver/IKeyValueStore.h
@ -29,7 +29,7 @@
 #include "fdbserver/IClosable.h"
 #include "fdbserver/IPageEncryptionKeyProvider.actor.h"
 #include "fdbserver/ServerDBInfo.h"
-#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/StorageMetrics.actor.h"

 struct CheckpointRequest {
 	const Version version; // The FDB version at which the checkpoint is created.
--- a/fdbserver/include/fdbserver/MockGlobalState.h
+++ b/fdbserver/include/fdbserver/MockGlobalState.h
@ -21,10 +21,11 @@
 #ifndef FOUNDATIONDB_MOCKGLOBALSTATE_H
 #define FOUNDATIONDB_MOCKGLOBALSTATE_H

-#include "StorageMetrics.h"
+#include "StorageMetrics.actor.h"
 #include "fdbclient/KeyRangeMap.h"
 #include "fdbclient/StorageServerInterface.h"
 #include "fdbclient/DatabaseConfiguration.h"
+#include "fdbclient/KeyLocationService.h"
 #include "SimulatedCluster.h"
 #include "ShardsAffectedByTeamFailure.h"

@ -51,9 +52,11 @@ inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) {
 	return false;
 }

-class MockStorageServer {
+class MockStorageServer : public IStorageMetricsService {
 	friend struct MockGlobalStateTester;

+	ActorCollection actors;
+
 public:
 	struct ShardInfo {
 		MockShardStatus status;
@ -73,8 +76,6 @@ public:
 	// size() and nthRange() would use the metrics as index instead
 	KeyRangeMap<ShardInfo> serverKeys;

-	// sampled metrics
-	StorageServerMetrics metrics;
 	CoalescedKeyRangeMap<bool, int64_t, KeyBytesMetric<int64_t>> byteSampleClears;

 	StorageServerInterface ssi; // serve RPC requests
@ -103,6 +104,35 @@ public:

 	uint64_t sumRangeSize(KeyRangeRef range) const;

+	void addActor(Future<Void> future) override;
+
+	void getSplitPoints(SplitRangeRequest const& req) override;
+
+	Future<Void> waitMetricsTenantAware(const WaitMetricsRequest& req) override;
+
+	void getStorageMetrics(const GetStorageMetricsRequest& req) override;
+
+	template <class Reply>
+	static constexpr bool isLoadBalancedReply = std::is_base_of_v<LoadBalancedReply, Reply>;
+
+	template <class Reply>
+	typename std::enable_if_t<isLoadBalancedReply<Reply>, void> sendErrorWithPenalty(const ReplyPromise<Reply>& promise,
+	                                                                                 const Error& err,
+	                                                                                 double penalty) {
+		Reply reply;
+		reply.error = err;
+		reply.penalty = penalty;
+		promise.send(reply);
+	}
+
+	template <class Reply>
+	typename std::enable_if_t<!isLoadBalancedReply<Reply>, void>
+	sendErrorWithPenalty(const ReplyPromise<Reply>& promise, const Error& err, double) {
+		promise.sendError(err);
+	}
+
+	Future<Void> run();
+
 protected:
 	void threeWayShardSplitting(KeyRangeRef outerRange,
 	                            KeyRangeRef innerRange,
@ -112,8 +142,13 @@ protected:
 	void twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize);
 };

-class MockGlobalState {
+class MockGlobalStateImpl;
+
+class MockGlobalState : public IKeyLocationService {
 	friend struct MockGlobalStateTester;
+	friend class MockGlobalStateImpl;
+
+	std::vector<StorageServerInterface> extractStorageServerInterfaces(const std::vector<UID>& ids) const;

 public:
 	typedef ShardsAffectedByTeamFailure::Team Team;
@ -162,7 +197,37 @@ public:
 	 * * mgs.shardMapping doesn’t have any information about X
 	 * * mgs.allServer[X] is existed
 	 */
-	bool allShardRemovedFromServer(const UID& serverId);
+	bool allShardsRemovedFromServer(const UID& serverId);
+
+	// SOMEDAY: NativeAPI::waitStorageMetrics should share the code in the future, this is a simpler version of it
+	Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
+	                                                                    StorageMetrics const& min,
+	                                                                    StorageMetrics const& max,
+	                                                                    StorageMetrics const& permittedError,
+	                                                                    int shardLimit,
+	                                                                    int expectedShardCount);
+
+	Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(const KeyRange& keys,
+	                                                          const StorageMetrics& limit,
+	                                                          const StorageMetrics& estimated,
+	                                                          const Optional<int>& minSplitBytes);
+
+	Future<KeyRangeLocationInfo> getKeyLocation(TenantInfo tenant,
+	                                            Key key,
+	                                            SpanContext spanContext,
+	                                            Optional<UID> debugID,
+	                                            UseProvisionalProxies useProvisionalProxies,
+	                                            Reverse isBackward,
+	                                            Version version) override;
+
+	Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations(TenantInfo tenant,
+	                                                               KeyRange keys,
+	                                                               int limit,
+	                                                               Reverse reverse,
+	                                                               SpanContext spanContext,
+	                                                               Optional<UID> debugID,
+	                                                               UseProvisionalProxies useProvisionalProxies,
+	                                                               Version version) override;
 };

 #endif // FOUNDATIONDB_MOCKGLOBALSTATE_H
--- a/fdbserver/include/fdbserver/ProxyCommitData.actor.h
+++ b/fdbserver/include/fdbserver/ProxyCommitData.actor.h
@ -156,7 +156,7 @@ struct ProxyStats {
 		specialCounter(cc, "NumTenants", [pTenantMap]() { return pTenantMap ? pTenantMap->size() : 0; });
 		specialCounter(cc, "MaxCompute", [this]() { return this->getAndResetMaxCompute(); });
 		specialCounter(cc, "MinCompute", [this]() { return this->getAndResetMinCompute(); });
-		logger = traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ProxyMetrics");
+		logger = cc.traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ProxyMetrics");
 	}
 };

--- a/fdbserver/include/fdbserver/RestoreApplier.actor.h
+++ b/fdbserver/include/fdbserver/RestoreApplier.actor.h
@ -284,11 +284,11 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
 	  : vbState(ApplierVersionBatchState::NOT_INIT), receiveMutationReqs(0), receivedBytes(0), appliedBytes(0),
 	    targetWriteRateMB(SERVER_KNOBS->FASTRESTORE_WRITE_BW_MB / SERVER_KNOBS->FASTRESTORE_NUM_APPLIERS),
 	    totalBytesToWrite(-1), applyingDataBytes(0), counters(this, nodeID, batchIndex) {
-		pollMetrics = traceCounters(format("FastRestoreApplierMetrics%d", batchIndex),
-		                            nodeID,
-		                            SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
-		                            &counters.cc,
-		                            nodeID.toString() + "/RestoreApplierMetrics/" + std::to_string(batchIndex));
+		pollMetrics =
+		    counters.cc.traceCounters(format("FastRestoreApplierMetrics%d", batchIndex),
+		                              nodeID,
+		                              SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
+		                              nodeID.toString() + "/RestoreApplierMetrics/" + std::to_string(batchIndex));
 		TraceEvent("FastRestoreApplierMetricsCreated").detail("Node", nodeID);
 	}
 	~ApplierBatchData() {
--- a/fdbserver/include/fdbserver/RestoreLoader.actor.h
+++ b/fdbserver/include/fdbserver/RestoreLoader.actor.h
@ -93,11 +93,11 @@ struct LoaderBatchData : public ReferenceCounted<LoaderBatchData> {

 	explicit LoaderBatchData(UID nodeID, int batchIndex)
 	  : vbState(LoaderVersionBatchState::NOT_INIT), loadFileReqs(0), counters(this, nodeID, batchIndex) {
-		pollMetrics = traceCounters(format("FastRestoreLoaderMetrics%d", batchIndex),
-		                            nodeID,
-		                            SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
-		                            &counters.cc,
-		                            nodeID.toString() + "/RestoreLoaderMetrics/" + std::to_string(batchIndex));
+		pollMetrics =
+		    counters.cc.traceCounters(format("FastRestoreLoaderMetrics%d", batchIndex),
+		                              nodeID,
+		                              SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
+		                              nodeID.toString() + "/RestoreLoaderMetrics/" + std::to_string(batchIndex));
 		TraceEvent("FastRestoreLoaderMetricsCreated").detail("Node", nodeID);
 	}

--- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h
+++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h
@ -80,8 +80,12 @@ public:
 	bool hasShards(Team team) const;

 	// The first element of the pair is either the source for non-moving shards or the destination team for in-flight
-	// shards The second element of the pair is all previous sources for in-flight shards
-	std::pair<std::vector<Team>, std::vector<Team>> getTeamsFor(KeyRangeRef keys);
+	// shards. The second element of the pair is all previous sources for in-flight shards. This function only returns
+	// the teams for the first shard in [keys.begin, keys.end)
+	std::pair<std::vector<Team>, std::vector<Team>> getTeamsForFirstShard(KeyRangeRef keys);
+
+	std::pair<std::vector<Team>, std::vector<Team>> getTeamsFor(KeyRef key);
+
 	// Shard boundaries are modified in defineShard and the content of what servers correspond to each shard is a copy
 	// or union of the shards already there
 	void defineShard(KeyRangeRef keys);
@ -124,6 +128,7 @@ private:
 public:
 	// return the iterator that traversing all ranges
 	auto getAllRanges() const -> decltype(shard_teams)::ConstRanges;
+	auto intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges;
 	// get total shards count
 	size_t getNumberOfShards() const;
 	void removeFailedServerForRange(KeyRangeRef keys, const UID& serverID);
--- a/fdbserver/include/fdbserver/StorageMetrics.actor.h
+++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h
@ -19,13 +19,18 @@
 */

 #pragma once
-
+#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_STORAGEMETRICS_G_H)
+#define FDBSERVER_STORAGEMETRICS_G_H
+#include "fdbserver/StorageMetrics.actor.g.h"
+#elif !defined(FDBSERVER_STORAGEMETRICS_H)
+#define FDBSERVER_STORAGEMETRICS_H
 #include "fdbclient/FDBTypes.h"
 #include "fdbrpc/simulator.h"
 #include "flow/UnitTest.h"
 #include "fdbclient/StorageServerInterface.h"
 #include "fdbclient/KeyRangeMap.h"
 #include "fdbserver/Knobs.h"
+#include "flow/actorcompiler.h"

 const StringRef STORAGESERVER_HISTOGRAM_GROUP = "StorageServer"_sr;
 const StringRef FETCH_KEYS_LATENCY_HISTOGRAM = "FetchKeysLatency"_sr;
@ -152,3 +157,76 @@ struct ByteSampleInfo {
 // Determines whether a key-value pair should be included in a byte sample
 // Also returns size information about the sample
 ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue);
+
+class IStorageMetricsService {
+public:
+	StorageServerMetrics metrics;
+
+	// penalty used by loadBalance() to balance requests among service instances
+	virtual double getPenalty() const { return 1; }
+
+	virtual bool isReadable(KeyRangeRef const& keys) const { return true; }
+
+	virtual void addActor(Future<Void> future) = 0;
+
+	virtual void getSplitPoints(SplitRangeRequest const& req) = 0;
+
+	virtual Future<Void> waitMetricsTenantAware(const WaitMetricsRequest& req) = 0;
+
+	virtual void getStorageMetrics(const GetStorageMetricsRequest& req) = 0;
+
+	// NOTE: also need to have this function but template can't be a virtual so...
+	// template <class Reply>
+	// void sendErrorWithPenalty(const ReplyPromise<Reply>& promise, const Error& err, double penalty);
+};
+
+ACTOR template <class ServiceType>
+Future<Void> serveStorageMetricsRequests(ServiceType* self, StorageServerInterface ssi) {
+	state Future<Void> doPollMetrics = Void();
+	loop {
+		choose {
+			when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
+				if (!req.tenantInfo.present() && !self->isReadable(req.keys)) {
+					CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()");
+					self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
+				} else {
+					self->addActor(self->waitMetricsTenantAware(req));
+				}
+			}
+			when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
+				if (!self->isReadable(req.keys)) {
+					CODE_PROBE(true, "splitMetrics immediate wrong_shard_server()");
+					self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
+				} else {
+					self->metrics.splitMetrics(req);
+				}
+			}
+			when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
+				self->getStorageMetrics(req);
+			}
+			when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
+				if (!self->isReadable(req.keys)) {
+					CODE_PROBE(true, "readHotSubRanges immediate wrong_shard_server()", probe::decoration::rare);
+					self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
+				} else {
+					self->metrics.getReadHotRanges(req);
+				}
+			}
+			when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
+				if (!self->isReadable(req.keys)) {
+					CODE_PROBE(true, "getSplitPoints immediate wrong_shard_server()");
+					self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
+				} else {
+					self->getSplitPoints(req);
+				}
+			}
+			when(wait(doPollMetrics)) {
+				self->metrics.poll();
+				doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY);
+			}
+		}
+	}
+}
+
+#include "flow/unactorcompiler.h"
+#endif // FDBSERVER_STORAGEMETRICS_H
--- a/fdbserver/include/fdbserver/TCInfo.h
+++ b/fdbserver/include/fdbserver/TCInfo.h
@ -268,5 +268,4 @@ public:
 	void removeTeam(TCTeamInfo team);
 	void updateCacheGeneration(int64_t generation) { m_cacheGeneration = generation; }
 	int64_t cacheGeneration() const { return m_cacheGeneration; }
-	void updateStorageUsage(int64_t size) { m_tenantInfo.storageUsage = size; }
 };
--- a/fdbserver/include/fdbserver/TenantCache.h
+++ b/fdbserver/include/fdbserver/TenantCache.h
@ -32,6 +32,12 @@

 typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix;

+struct Storage {
+	int64_t quota = std::numeric_limits<int64_t>::max();
+	int64_t usage = 0;
+};
+typedef std::unordered_map<TenantName, Storage> TenantStorageMap;
+
 struct TenantCacheTenantCreated {
 	KeyRange keys;
 	Promise<bool> reply;
@ -50,6 +56,9 @@ private:
 	uint64_t generation;
 	TenantMapByPrefix tenantCache;

+	// Map from tenant names to storage quota and usage
+	TenantStorageMap tenantStorageMap;
+
 	// mark the start of a new sweep of the tenant cache
 	void startRefresh();

@ -62,11 +71,8 @@ private:
 	// return count of tenants that were found to be stale and removed from the cache
 	int cleanup();

-	// return the mapping from prefix -> tenant name for all tenants stored in the cache
-	std::vector<std::pair<KeyRef, TenantName>> getTenantList() const;
-
-	// update the size for a tenant; do nothing if the tenant doesn't exist in the map
-	void updateStorageUsage(KeyRef prefix, int64_t size);
+	// return all the TenantName for all tenants stored in the cache
+	std::vector<TenantName> getTenantList() const;

 	UID id() const { return distributorID; }

@ -85,9 +91,14 @@ public:

 	Future<Void> monitorStorageUsage();

+	Future<Void> monitorStorageQuota();
+
 	std::string desc() const;

 	bool isTenantKey(KeyRef key) const;

 	Optional<Reference<TCTenantInfo>> tenantOwning(KeyRef key) const;
+
+	// Get the list of tenants where the storage bytes currently used is greater than the quota allocated
+	std::vector<TenantName> getTenantsOverQuota() const;
 };
--- a/fdbserver/masterserver.actor.cpp
+++ b/fdbserver/masterserver.actor.cpp
@ -114,7 +114,7 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
 	                         SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
 	                         SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
 	    addActor(addActor) {
-		logger = traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "MasterMetrics");
+		logger = cc.traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "MasterMetrics");
 		if (forceRecovery && !myInterface.locality.dcId().present()) {
 			TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log();
 			forceRecovery = false;
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@ -81,11 +81,12 @@
 #include "fdbserver/ServerCheckpoint.actor.h"
 #include "fdbserver/ServerDBInfo.h"
 #include "fdbserver/SpanContextMessage.h"
-#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/StorageMetrics.actor.h"
 #include "fdbserver/TLogInterface.h"
 #include "fdbserver/TransactionTagCounter.h"
 #include "fdbserver/WaitFailure.h"
 #include "fdbserver/WorkerInterface.actor.h"
+#include "fdbserver/BlobGranuleServerCommon.actor.h"
 #include "flow/ActorCollection.h"
 #include "flow/Arena.h"
 #include "flow/Error.h"
@ -641,7 +642,7 @@ struct BusiestWriteTagContext {
 	    busiestWriteTagEventHolder(makeReference<EventCacheHolder>(busiestWriteTagTrackingKey)), lastUpdateTime(-1) {}
 };

-struct StorageServer {
+struct StorageServer : public IStorageMetricsService {
 	typedef VersionedMap<KeyRef, ValueOrClearToRef> VersionedData;

 private:
@ -807,8 +808,8 @@ public:
 	VersionedData const& data() const { return versionedData; }
 	VersionedData& mutableData() { return versionedData; }

-	double old_rate = 1.0;
-	double currentRate() {
+	mutable double old_rate = 1.0;
+	double currentRate() const {
 		auto versionLag = version.get() - durableVersion.get();
 		double res;
 		if (versionLag >= SERVER_KNOBS->STORAGE_DURABILITY_LAG_HARD_MAX) {
@ -988,7 +989,6 @@ public:
 	Database cx;
 	ActorCollection actors;

-	StorageServerMetrics metrics;
 	CoalescedKeyRangeMap<bool, int64_t, KeyBytesMetric<int64_t>> byteSampleClears;
 	AsyncVar<bool> byteSampleClearsTooLarge;
 	Future<Void> byteSampleRecovery;
@ -1308,10 +1308,10 @@ public:
 	    storageServerSourceTLogIDEventHolder(
 	        makeReference<EventCacheHolder>(ssi.id().toString() + "/StorageServerSourceTLogID")) {

-		version.initMetric("StorageServer.Version"_sr, counters.cc.id);
-		oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.id);
-		durableVersion.initMetric("StorageServer.DurableVersion"_sr, counters.cc.id);
-		desiredOldestVersion.initMetric("StorageServer.DesiredOldestVersion"_sr, counters.cc.id);
+		version.initMetric("StorageServer.Version"_sr, counters.cc.getId());
+		oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.getId());
+		durableVersion.initMetric("StorageServer.DurableVersion"_sr, counters.cc.getId());
+		desiredOldestVersion.initMetric("StorageServer.DesiredOldestVersion"_sr, counters.cc.getId());

 		newestAvailableVersion.insert(allKeys, invalidVersion);
 		newestDirtyVersion.insert(allKeys, invalidVersion);
@ -1380,7 +1380,7 @@ public:
 	// This is the maximum version that might be read from storage (the minimum version is durableVersion)
 	Version storageVersion() const { return oldestVersion.get(); }

-	bool isReadable(KeyRangeRef const& keys) {
+	bool isReadable(KeyRangeRef const& keys) const override {
 		auto sh = shards.intersectingRanges(keys);
 		for (auto i = sh.begin(); i != sh.end(); ++i)
 			if (!i->value()->isReadable())
@ -1406,10 +1406,10 @@ public:
 		}
 	}

-	Counter::Value queueSize() { return counters.bytesInput.getValue() - counters.bytesDurable.getValue(); }
+	Counter::Value queueSize() const { return counters.bytesInput.getValue() - counters.bytesDurable.getValue(); }

 	// penalty used by loadBalance() to balance requests among SSes. We prefer SS with less write queue size.
-	double getPenalty() {
+	double getPenalty() const override {
 		return std::max(std::max(1.0,
 		                         (queueSize() - (SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER -
 		                                         2.0 * SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER)) /
@ -1503,7 +1503,7 @@ public:
 		}
 	}

-	void getSplitPoints(SplitRangeRequest const& req) {
+	void getSplitPoints(SplitRangeRequest const& req) override {
 		try {
 			Optional<TenantMapEntry> entry = getTenantEntry(version.get(), req.tenantInfo);
 			metrics.getSplitPoints(req, entry.map<Key>([](TenantMapEntry e) { return e.prefix; }));
@ -1533,6 +1533,15 @@ public:
 		}
 		return false;
 	}
+
+	Future<Void> waitMetricsTenantAware(const WaitMetricsRequest& req) override;
+
+	void addActor(Future<Void> future) override { actors.add(future); }
+
+	void getStorageMetrics(const GetStorageMetricsRequest& req) override {
+		StorageBytes sb = storage.getStorageBytes();
+		metrics.getStorageMetrics(req, sb, counters.bytesInput.getRate(), versionLag, lastUpdate);
+	}
 };

 const StringRef StorageServer::CurrentRunningFetchKeys::emptyString = ""_sr;
@ -5976,27 +5985,26 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
                                       Reference<BlobConnectionProvider> blobConn) {
 	ASSERT(blobConn.isValid());
 	try {
-
 		state Standalone<VectorRef<BlobGranuleChunkRef>> chunks = wait(tryReadBlobGranules(tr, keys, fetchVersion));
-
 		if (chunks.size() == 0) {
 			throw blob_granule_transaction_too_old(); // no data on blob
 		}
-
 		if (!isRangeFullyCovered(keys, chunks)) {
 			throw blob_granule_transaction_too_old();
 		}
-
-		for (const BlobGranuleChunkRef& chunk : chunks) {
-			state KeyRangeRef chunkRange = chunk.keyRange;
-			state RangeResult rows = wait(readBlobGranule(chunk, keys, 0, fetchVersion, blobConn));
+		state int i;
+		for (i = 0; i < chunks.size(); ++i) {
+			state KeyRangeRef chunkRange = chunks[i].keyRange;
+			state RangeResult rows = wait(readBlobGranule(chunks[i], keys, 0, fetchVersion, blobConn));
 			TraceEvent("ReadBlobData")
 			    .detail("Rows", rows.size())
 			    .detail("ChunkRange", chunkRange.toString())
 			    .detail("Keys", keys.toString());
-
 			if (rows.size() == 0) {
-				rows.readThrough = KeyRef(rows.arena(), chunkRange.end);
+				rows.readThrough = KeyRef(rows.arena(), std::min(chunkRange.end, keys.end));
+			}
+			if (i == chunks.size() - 1) {
+				rows.readThrough = KeyRef(rows.arena(), keys.end);
 			}
 			results.send(rows);
 		}
@ -6010,7 +6018,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
 		tr->reset();
 		tr->setVersion(fetchVersion);
 		tr->trState->taskID = TaskPriority::FetchKeys;
-		wait(tryGetRange(results, tr, keys)); // fail back to storage server
+		throw;
 	}
 	return Void();
 }
@ -6798,8 +6806,10 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
 		// We must also ensure we have fetched all change feed metadata BEFORE changing the phase to fetching to ensure
 		// change feed mutations get applied correctly
 		state std::vector<Key> changeFeedsToFetch;
-		std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
-		changeFeedsToFetch = _cfToFetch;
+		if (!isFullRestoreMode()) {
+			std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
+			changeFeedsToFetch = _cfToFetch;
+		}
 		wait(data->durableVersionLock.take());

 		shard->phase = AddingShard::Fetching;
@ -10166,7 +10176,7 @@ Future<Void> StorageServerMetrics::waitMetrics(WaitMetricsRequest req, Future<Vo
 #pragma region Core
 #endif

-ACTOR Future<Void> waitMetricsTenantAware(StorageServer* self, WaitMetricsRequest req) {
+ACTOR Future<Void> waitMetricsTenantAware_internal(StorageServer* self, WaitMetricsRequest req) {
 	if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) {
 		wait(success(waitForVersionNoTooOld(self, latestVersion)));
 		Optional<TenantMapEntry> entry = self->getTenantEntry(latestVersion, req.tenantInfo.get());
@ -10184,85 +10194,45 @@ ACTOR Future<Void> waitMetricsTenantAware(StorageServer* self, WaitMetricsReques
 	return Void();
 }

+Future<Void> StorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req) {
+	return waitMetricsTenantAware_internal(this, req);
+}
+
 ACTOR Future<Void> metricsCore(StorageServer* self, StorageServerInterface ssi) {
-	state Future<Void> doPollMetrics = Void();

 	wait(self->byteSampleRecovery);
 	TraceEvent("StorageServerRestoreDurableState", self->thisServerID).detail("RestoredBytes", self->bytesRestored);

 	// Logs all counters in `counters.cc` and reset the interval.
-	self->actors.add(traceCounters("StorageMetrics",
-	                               self->thisServerID,
-	                               SERVER_KNOBS->STORAGE_LOGGING_DELAY,
-	                               &self->counters.cc,
-	                               self->thisServerID.toString() + "/StorageMetrics",
-	                               [self = self](TraceEvent& te) {
-		                               te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString());
-		                               te.detail("Tag", self->tag.toString());
-		                               StorageBytes sb = self->storage.getStorageBytes();
-		                               te.detail("KvstoreBytesUsed", sb.used);
-		                               te.detail("KvstoreBytesFree", sb.free);
-		                               te.detail("KvstoreBytesAvailable", sb.available);
-		                               te.detail("KvstoreBytesTotal", sb.total);
-		                               te.detail("KvstoreBytesTemp", sb.temp);
-		                               if (self->isTss()) {
-			                               te.detail("TSSPairID", self->tssPairID);
-			                               te.detail("TSSJointID",
-			                                         UID(self->thisServerID.first() ^ self->tssPairID.get().first(),
-			                                             self->thisServerID.second() ^ self->tssPairID.get().second()));
-		                               } else if (self->isSSWithTSSPair()) {
-			                               te.detail("SSPairID", self->ssPairID);
-			                               te.detail("TSSJointID",
-			                                         UID(self->thisServerID.first() ^ self->ssPairID.get().first(),
-			                                             self->thisServerID.second() ^ self->ssPairID.get().second()));
-		                               }
-	                               }));
+	self->actors.add(self->counters.cc.traceCounters(
+	    "StorageMetrics",
+	    self->thisServerID,
+	    SERVER_KNOBS->STORAGE_LOGGING_DELAY,
+	    self->thisServerID.toString() + "/StorageMetrics",
+	    [self = self](TraceEvent& te) {
+		    te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString());
+		    te.detail("Tag", self->tag.toString());
+		    StorageBytes sb = self->storage.getStorageBytes();
+		    te.detail("KvstoreBytesUsed", sb.used);
+		    te.detail("KvstoreBytesFree", sb.free);
+		    te.detail("KvstoreBytesAvailable", sb.available);
+		    te.detail("KvstoreBytesTotal", sb.total);
+		    te.detail("KvstoreBytesTemp", sb.temp);
+		    if (self->isTss()) {
+			    te.detail("TSSPairID", self->tssPairID);
+			    te.detail("TSSJointID",
+			              UID(self->thisServerID.first() ^ self->tssPairID.get().first(),
+			                  self->thisServerID.second() ^ self->tssPairID.get().second()));
+		    } else if (self->isSSWithTSSPair()) {
+			    te.detail("SSPairID", self->ssPairID);
+			    te.detail("TSSJointID",
+			              UID(self->thisServerID.first() ^ self->ssPairID.get().first(),
+			                  self->thisServerID.second() ^ self->ssPairID.get().second()));
+		    }
+	    }));

-	loop {
-		choose {
-			when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
-				if (!req.tenantInfo.present() && !self->isReadable(req.keys)) {
-					CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()");
-					self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
-				} else {
-					self->actors.add(waitMetricsTenantAware(self, req));
-				}
-			}
-			when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
-				if (!self->isReadable(req.keys)) {
-					CODE_PROBE(true, "splitMetrics immediate wrong_shard_server()");
-					self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
-				} else {
-					self->metrics.splitMetrics(req);
-				}
-			}
-			when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
-				StorageBytes sb = self->storage.getStorageBytes();
-				self->metrics.getStorageMetrics(
-				    req, sb, self->counters.bytesInput.getRate(), self->versionLag, self->lastUpdate);
-			}
-			when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
-				if (!self->isReadable(req.keys)) {
-					CODE_PROBE(true, "readHotSubRanges immediate wrong_shard_server()", probe::decoration::rare);
-					self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
-				} else {
-					self->metrics.getReadHotRanges(req);
-				}
-			}
-			when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
-				if (!self->isReadable(req.keys)) {
-					CODE_PROBE(true, "getSplitPoints immediate wrong_shard_server()");
-					self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
-				} else {
-					self->getSplitPoints(req);
-				}
-			}
-			when(wait(doPollMetrics)) {
-				self->metrics.poll();
-				doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY);
-			}
-		}
-	}
+	wait(serveStorageMetricsRequests(self, ssi));
+	return Void();
 }

 ACTOR Future<Void> logLongByteSampleRecovery(Future<Void> recovery) {
--- a/fdbserver/worker.actor.cpp
+++ b/fdbserver/worker.actor.cpp
@ -2267,7 +2267,25 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
 					CODE_PROBE(true, "Recruited while already a blob migrator.");
 				} else {
 					startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id());
-					DUMPTOKEN(recruited.waitFailure);
+					DUMPTOKEN(recruited.haltBlobMigrator);
+					DUMPTOKEN(recruited.ssi.getValue);
+					DUMPTOKEN(recruited.ssi.getKey);
+					DUMPTOKEN(recruited.ssi.getKeyValues);
+					DUMPTOKEN(recruited.ssi.getMappedKeyValues);
+					DUMPTOKEN(recruited.ssi.getShardState);
+					DUMPTOKEN(recruited.ssi.waitMetrics);
+					DUMPTOKEN(recruited.ssi.splitMetrics);
+					DUMPTOKEN(recruited.ssi.getReadHotRanges);
+					DUMPTOKEN(recruited.ssi.getRangeSplitPoints);
+					DUMPTOKEN(recruited.ssi.getStorageMetrics);
+					DUMPTOKEN(recruited.ssi.waitFailure);
+					DUMPTOKEN(recruited.ssi.getQueuingMetrics);
+					DUMPTOKEN(recruited.ssi.getKeyValueStoreType);
+					DUMPTOKEN(recruited.ssi.watchValue);
+					DUMPTOKEN(recruited.ssi.getKeyValuesStream);
+					DUMPTOKEN(recruited.ssi.changeFeedStream);
+					DUMPTOKEN(recruited.ssi.changeFeedPop);
+					DUMPTOKEN(recruited.ssi.changeFeedVersionUpdate);

 					Future<Void> blobMigratorProcess = blobMigrator(recruited, dbInfo);
 					errorForwarders.add(forwardError(errors,
--- a/fdbserver/workloads/AtomicRestore.actor.cpp
+++ b/fdbserver/workloads/AtomicRestore.actor.cpp
@ -18,6 +18,7 @@
 * limitations under the License.
 */

+#include "fdbclient/ManagementAPI.actor.h"
 #include "fdbrpc/simulator.h"
 #include "fdbclient/BackupAgent.actor.h"
 #include "fdbserver/Knobs.h"
@ -95,6 +96,7 @@ struct AtomicRestoreWorkload : TestWorkload {
 		TraceEvent("AtomicRestore_Start").detail("UsePartitionedLog", self->usePartitionedLogs);

 		state std::string backupContainer = "file://simfdb/backups/";
+		state DatabaseConfiguration conf = wait(getDatabaseConfiguration(cx));
 		try {
 			wait(backupAgent.submitBackup(cx,
 			                              StringRef(backupContainer),
@ -103,7 +105,8 @@ struct AtomicRestoreWorkload : TestWorkload {
 			                              deterministicRandom()->randomInt(0, 100),
 			                              BackupAgentBase::getDefaultTagName(),
 			                              self->backupRanges,
-			                              SERVER_KNOBS->ENABLE_ENCRYPTION,
+			                              SERVER_KNOBS->ENABLE_ENCRYPTION &&
+			                                  conf.tenantMode != TenantMode::OPTIONAL_TENANT,
 			                              StopWhenDone::False,
 			                              self->usePartitionedLogs));
 		} catch (Error& e) {
--- a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp
+++ b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp
@ -215,7 +215,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {

 		state std::string backupContainer = "file://simfdb/backups/";
 		state Future<Void> status = statusLoop(cx, tag.toString());
-
+		state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
 		try {
 			wait(backupAgent->submitBackup(cx,
 			                               StringRef(backupContainer),
@ -224,7 +224,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
 			                               deterministicRandom()->randomInt(0, 100),
 			                               tag.toString(),
 			                               backupRanges,
-			                               SERVER_KNOBS->ENABLE_ENCRYPTION,
+			                               SERVER_KNOBS->ENABLE_ENCRYPTION &&
+			                                   configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
 			                               StopWhenDone{ !stopDifferentialDelay },
 			                               self->usePartitionedLogs));
 		} catch (Error& e) {
@ -474,6 +475,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
 			// Occasionally start yet another backup that might still be running when we restore
 			if (!self->locked && BUGGIFY) {
 				TraceEvent("BARW_SubmitBackup2", randomID).detail("Tag", printable(self->backupTag));
+				state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
 				try {
 					// Note the "partitionedLog" must be false, because we change
 					// the configuration to disable backup workers before restore.
@ -484,7 +486,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
 					                                       deterministicRandom()->randomInt(0, 100),
 					                                       self->backupTag.toString(),
 					                                       self->backupRanges,
-					                                       SERVER_KNOBS->ENABLE_ENCRYPTION,
+					                                       SERVER_KNOBS->ENABLE_ENCRYPTION &&
+					                                           configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
 					                                       StopWhenDone::True,
 					                                       UsePartitionedLog::False);
 				} catch (Error& e) {
--- a/fdbserver/workloads/BackupCorrectness.actor.cpp
+++ b/fdbserver/workloads/BackupCorrectness.actor.cpp
@ -18,6 +18,7 @@
 * limitations under the License.
 */

+#include "fdbclient/ManagementAPI.actor.h"
 #include "fdbclient/ReadYourWrites.h"
 #include "fdbrpc/simulator.h"
 #include "fdbclient/BackupAgent.actor.h"
@ -331,7 +332,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {

 		state std::string backupContainer = "file://simfdb/backups/";
 		state Future<Void> status = statusLoop(cx, tag.toString());
-
+		state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
 		try {
 			wait(backupAgent->submitBackup(cx,
 			                               StringRef(backupContainer),
@ -340,7 +341,8 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
 			                               deterministicRandom()->randomInt(0, 2000),
 			                               tag.toString(),
 			                               backupRanges,
-			                               SERVER_KNOBS->ENABLE_ENCRYPTION,
+			                               SERVER_KNOBS->ENABLE_ENCRYPTION &&
+			                                   configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
 			                               StopWhenDone{ !stopDifferentialDelay },
 			                               UsePartitionedLog::False,
 			                               IncrementalBackupOnly::False,
@ -515,6 +517,42 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
 		return Void();
 	}

+	ACTOR static Future<Void> clearAndRestoreSystemKeys(Database cx,
+	                                                    BackupAndRestoreCorrectnessWorkload* self,
+	                                                    FileBackupAgent* backupAgent,
+	                                                    Version targetVersion,
+	                                                    Reference<IBackupContainer> lastBackupContainer,
+	                                                    Standalone<VectorRef<KeyRangeRef>> systemRestoreRanges) {
+		// restore system keys before restoring any other ranges
+		wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
+			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+			for (auto& range : systemRestoreRanges)
+				tr->clear(range);
+			return Void();
+		}));
+		state Standalone<StringRef> restoreTag(self->backupTag.toString() + "_system");
+		printf("BackupCorrectness, backupAgent.restore is called for tag:%s\n", restoreTag.toString().c_str());
+		wait(success(backupAgent->restore(cx,
+		                                  cx,
+		                                  restoreTag,
+		                                  KeyRef(lastBackupContainer->getURL()),
+		                                  lastBackupContainer->getProxy(),
+		                                  systemRestoreRanges,
+		                                  WaitForComplete::True,
+		                                  targetVersion,
+		                                  Verbose::True,
+		                                  Key(),
+		                                  Key(),
+		                                  self->locked,
+		                                  UnlockDB::True,
+		                                  OnlyApplyMutationLogs::False,
+		                                  InconsistentSnapshotOnly::False,
+		                                  ::invalidVersion,
+		                                  self->encryptionKeyFileName)));
+		printf("BackupCorrectness, backupAgent.restore finished for tag:%s\n", restoreTag.toString().c_str());
+		return Void();
+	}
+
 	ACTOR static Future<Void> _start(Database cx, BackupAndRestoreCorrectnessWorkload* self) {
 		state FileBackupAgent backupAgent;
 		state Future<Void> extraBackup;
@ -593,6 +631,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
 			// Occasionally start yet another backup that might still be running when we restore
 			if (!self->locked && BUGGIFY) {
 				TraceEvent("BARW_SubmitBackup2", randomID).detail("Tag", printable(self->backupTag));
+				state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
 				try {
 					extraBackup = backupAgent.submitBackup(cx,
 					                                       "file://simfdb/backups/"_sr,
@ -601,7 +640,8 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
 					                                       deterministicRandom()->randomInt(0, 100),
 					                                       self->backupTag.toString(),
 					                                       self->backupRanges,
-					                                       SERVER_KNOBS->ENABLE_ENCRYPTION,
+					                                       SERVER_KNOBS->ENABLE_ENCRYPTION &&
+					                                           configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
 					                                       StopWhenDone::True);
 				} catch (Error& e) {
 					TraceEvent("BARW_SubmitBackup2Exception", randomID)
@ -638,7 +678,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
 				                                                 lastBackupContainer->getEncryptionKeyFileName());
 				BackupDescription desc = wait(container->describeBackup());

-				Version targetVersion = -1;
+				state Version targetVersion = -1;
 				if (desc.maxRestorableVersion.present()) {
 					if (deterministicRandom()->random01() < 0.1) {
 						targetVersion = desc.minRestorableVersion.get();
@ -656,6 +696,32 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
 				state std::vector<Standalone<StringRef>> restoreTags;
 				state bool multipleRangesInOneTag = false;
 				state int restoreIndex = 0;
+				// make sure system keys are not present in the restoreRanges as they will get restored first separately
+				// from the rest
+				Standalone<VectorRef<KeyRangeRef>> modifiedRestoreRanges;
+				Standalone<VectorRef<KeyRangeRef>> systemRestoreRanges;
+				for (int i = 0; i < self->restoreRanges.size(); ++i) {
+					if (!SERVER_KNOBS->ENABLE_ENCRYPTION ||
+					    !self->restoreRanges[i].intersects(getSystemBackupRanges())) {
+						modifiedRestoreRanges.push_back_deep(modifiedRestoreRanges.arena(), self->restoreRanges[i]);
+					} else {
+						KeyRangeRef normalKeyRange = self->restoreRanges[i] & normalKeys;
+						KeyRangeRef systemKeyRange = self->restoreRanges[i] & systemKeys;
+						if (!normalKeyRange.empty()) {
+							modifiedRestoreRanges.push_back_deep(modifiedRestoreRanges.arena(), normalKeyRange);
+						}
+						if (!systemKeyRange.empty()) {
+							systemRestoreRanges.push_back_deep(systemRestoreRanges.arena(), systemKeyRange);
+						}
+					}
+				}
+				self->restoreRanges = modifiedRestoreRanges;
+				if (!systemRestoreRanges.empty()) {
+					// We are able to restore system keys first since we restore an entire cluster at once rather than
+					// partial key ranges.
+					wait(clearAndRestoreSystemKeys(
+					    cx, self, &backupAgent, targetVersion, lastBackupContainer, systemRestoreRanges));
+				}
 				if (deterministicRandom()->random01() < 0.5) {
 					for (restoreIndex = 0; restoreIndex < self->restoreRanges.size(); restoreIndex++) {
 						auto range = self->restoreRanges[restoreIndex];
@ -703,6 +769,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
 					                                       Key(),
 					                                       Key(),
 					                                       self->locked,
+					                                       UnlockDB::True,
 					                                       OnlyApplyMutationLogs::False,
 					                                       InconsistentSnapshotOnly::False,
 					                                       ::invalidVersion,
@ -735,6 +802,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
 							                                             Key(),
 							                                             Key(),
 							                                             self->locked,
+							                                             UnlockDB::True,
 							                                             OnlyApplyMutationLogs::False,
 							                                             InconsistentSnapshotOnly::False,
 							                                             ::invalidVersion,
--- a/fdbserver/workloads/BackupToBlob.actor.cpp
+++ b/fdbserver/workloads/BackupToBlob.actor.cpp
@ -21,6 +21,7 @@
 #include "fdbrpc/simulator.h"
 #include "fdbclient/BackupAgent.actor.h"
 #include "fdbclient/BackupContainer.h"
+#include "fdbclient/ManagementAPI.actor.h"
 #include "fdbserver/Knobs.h"
 #include "fdbserver/workloads/BlobStoreWorkload.h"
 #include "fdbserver/workloads/workloads.actor.h"
@ -57,6 +58,7 @@ struct BackupToBlobWorkload : TestWorkload {
 		addDefaultBackupRanges(backupRanges);

 		wait(delay(self->backupAfter));
+		state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
 		wait(backupAgent.submitBackup(cx,
 		                              self->backupURL,
 		                              {},
@ -64,7 +66,8 @@ struct BackupToBlobWorkload : TestWorkload {
 		                              self->snapshotInterval,
 		                              self->backupTag.toString(),
 		                              backupRanges,
-		                              SERVER_KNOBS->ENABLE_ENCRYPTION));
+		                              SERVER_KNOBS->ENABLE_ENCRYPTION &&
+		                                  configuration.tenantMode != TenantMode::OPTIONAL_TENANT));
 		EBackupState backupStatus = wait(backupAgent.waitBackup(cx, self->backupTag.toString(), StopWhenDone::True));
 		TraceEvent("BackupToBlob_BackupStatus").detail("Status", BackupAgentBase::getStateText(backupStatus));
 		return Void();
--- a/fdbserver/workloads/BackupToDBCorrectness.actor.cpp
+++ b/fdbserver/workloads/BackupToDBCorrectness.actor.cpp
@ -22,6 +22,7 @@
 #include "fdbclient/BackupAgent.actor.h"
 #include "fdbclient/ClusterConnectionMemoryRecord.h"
 #include "fdbclient/TenantManagement.actor.h"
+#include "fdbserver/Knobs.h"
 #include "fdbserver/workloads/workloads.actor.h"
 #include "fdbserver/workloads/BulkSetup.actor.h"
 #include "flow/ApiVersion.h"
@ -667,10 +668,47 @@ struct BackupToDBCorrectnessWorkload : TestWorkload {
 				// wait(diffRanges(self->backupRanges, self->backupPrefix, cx, self->extraDB));

 				state Standalone<VectorRef<KeyRangeRef>> restoreRange;
+				state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
 				for (auto r : self->backupRanges) {
-					restoreRange.push_back_deep(
-					    restoreRange.arena(),
-					    KeyRangeRef(r.begin.withPrefix(self->backupPrefix), r.end.withPrefix(self->backupPrefix)));
+					if (!SERVER_KNOBS->ENABLE_ENCRYPTION || !r.intersects(getSystemBackupRanges())) {
+						restoreRange.push_back_deep(
+						    restoreRange.arena(),
+						    KeyRangeRef(r.begin.withPrefix(self->backupPrefix), r.end.withPrefix(self->backupPrefix)));
+					} else {
+						KeyRangeRef normalKeyRange = r & normalKeys;
+						KeyRangeRef systemKeyRange = r & systemKeys;
+						if (!normalKeyRange.empty()) {
+							restoreRange.push_back_deep(restoreRange.arena(),
+							                            KeyRangeRef(normalKeyRange.begin.withPrefix(self->backupPrefix),
+							                                        normalKeyRange.end.withPrefix(self->backupPrefix)));
+						}
+						if (!systemKeyRange.empty()) {
+							systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
+						}
+					}
+				}
+
+				// restore system keys first before restoring user data
+				if (!systemRestoreRange.empty()) {
+					state Key systemRestoreTag = "restore_system"_sr;
+					try {
+						wait(restoreTool.submitBackup(cx,
+						                              systemRestoreTag,
+						                              systemRestoreRange,
+						                              StopWhenDone::True,
+						                              StringRef(),
+						                              self->backupPrefix,
+						                              self->locked,
+						                              DatabaseBackupAgent::PreBackupAction::CLEAR));
+					} catch (Error& e) {
+						TraceEvent("BARW_DoBackupSubmitBackupException", randomID)
+						    .error(e)
+						    .detail("Tag", printable(systemRestoreTag));
+						if (e.code() != error_code_backup_unneeded && e.code() != error_code_backup_duplicate)
+							throw;
+					}
+					wait(success(restoreTool.waitBackup(cx, systemRestoreTag)));
+					wait(restoreTool.unlockBackup(cx, systemRestoreTag));
 				}

 				try {
--- a/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp
+++ b/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp
@ -105,16 +105,6 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 		}
 	}

-	ACTOR Future<bool> setRange(Database cx, KeyRange range, bool active, Optional<TenantName> tenantName) {
-		if (active) {
-			bool success = wait(cx->blobbifyRange(range, tenantName));
-			return success;
-		} else {
-			bool success = wait(cx->unblobbifyRange(range, tenantName));
-			return success;
-		}
-	}
-
 	ACTOR Future<Void> registerNewRange(Database cx, BlobGranuleRangesWorkload* self, Optional<TenantName> tenantName) {
 		std::string nextRangeKey = "R_" + self->newKey();
 		state KeyRange range(KeyRangeRef(StringRef(nextRangeKey), strinc(StringRef(nextRangeKey))));
@ -124,8 +114,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {

 		// don't put in active ranges until AFTER set range command succeeds, to avoid checking a range that maybe
 		// wasn't initialized
-		bool success =
-		    wait(self->setRange(cx, range, true, tenantName.present() ? tenantName.get() : self->tenantName));
+		bool success = wait(cx->blobbifyRange(range, tenantName.present() ? tenantName.get() : self->tenantName));
 		ASSERT(success);

 		if (BGRW_DEBUG) {
@ -163,7 +152,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 			Key purgeKey = wait(self->versionedForcePurge(cx, range, self->tenantName));
 			wait(cx->waitPurgeGranulesComplete(purgeKey));
 		}
-		bool success = wait(self->setRange(cx, range, false, self->tenantName));
+		bool success = wait(cx->unblobbifyRange(range, self->tenantName));
 		ASSERT(success);

 		if (BGRW_DEBUG) {
@ -356,7 +345,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 		// tear down range at end
 		Key purgeKey = wait(self->versionedForcePurge(cx, range, self->tenantName));
 		wait(cx->waitPurgeGranulesComplete(purgeKey));
-		bool success = wait(self->setRange(cx, range, false, self->tenantName));
+		bool success = wait(cx->unblobbifyRange(range, self->tenantName));
 		ASSERT(success);

 		if (BGRW_DEBUG) {
@ -373,7 +362,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 		if (BGRW_DEBUG) {
 			fmt::print("VerifyRangeUnit: [{0} - {1})\n", range.begin.printable(), range.end.printable());
 		}
-		bool setSuccess = wait(self->setRange(cx, activeRange, true, self->tenantName));
+		bool setSuccess = wait(cx->blobbifyRange(activeRange, self->tenantName));
 		ASSERT(setSuccess);
 		wait(self->checkRange(cx, self, activeRange, true));

@ -426,7 +415,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 		for (i = 0; i < rangeCount; i++) {
 			state KeyRange subRange(KeyRangeRef(boundaries[i], boundaries[i + 1]));
 			if (i != rangeToNotBlobbify) {
-				bool setSuccess = wait(self->setRange(cx, subRange, true, self->tenantName));
+				bool setSuccess = wait(cx->blobbifyRange(subRange, self->tenantName));
 				ASSERT(setSuccess);
 				wait(self->checkRange(cx, self, subRange, true));
 			} else {
@ -473,7 +462,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 	}

 	ACTOR Future<Void> rangesMisalignedUnit(Database cx, BlobGranuleRangesWorkload* self, KeyRange range) {
-		bool setSuccess = wait(self->setRange(cx, range, true, self->tenantName));
+		bool setSuccess = wait(cx->blobbifyRange(range, self->tenantName));
 		ASSERT(setSuccess);
 		state KeyRange subRange(KeyRangeRef(range.begin.withSuffix("A"_sr), range.begin.withSuffix("B"_sr)));

@ -526,42 +515,42 @@ struct BlobGranuleRangesWorkload : TestWorkload {

 		// unblobbifying range that already doesn't exist should be no-op
 		if (deterministicRandom()->coinflip()) {
-			bool unblobbifyStartSuccess = wait(self->setRange(cx, activeRange, false, self->tenantName));
+			bool unblobbifyStartSuccess = wait(cx->blobbifyRange(activeRange, self->tenantName));
 			ASSERT(unblobbifyStartSuccess);
 		}

-		bool success = wait(self->setRange(cx, activeRange, true, self->tenantName));
+		bool success = wait(cx->blobbifyRange(activeRange, self->tenantName));
 		ASSERT(success);
 		wait(self->checkRange(cx, self, activeRange, true));

 		// check that re-blobbifying same range is successful
-		bool retrySuccess = wait(self->setRange(cx, activeRange, true, self->tenantName));
+		bool retrySuccess = wait(cx->blobbifyRange(activeRange, self->tenantName));
 		ASSERT(retrySuccess);
 		wait(self->checkRange(cx, self, activeRange, true));

 		// check that blobbifying range that overlaps but does not match existing blob range fails
-		bool fail1 = wait(self->setRange(cx, range, true, self->tenantName));
+		bool fail1 = wait(cx->blobbifyRange(range, self->tenantName));
 		ASSERT(!fail1);

-		bool fail2 = wait(self->setRange(cx, KeyRangeRef(range.begin, activeRange.end), true, self->tenantName));
+		bool fail2 = wait(cx->blobbifyRange(KeyRangeRef(range.begin, activeRange.end), self->tenantName));
 		ASSERT(!fail2);

-		bool fail3 = wait(self->setRange(cx, KeyRangeRef(activeRange.begin, range.end), true, self->tenantName));
+		bool fail3 = wait(cx->blobbifyRange(KeyRangeRef(activeRange.begin, range.end), self->tenantName));
 		ASSERT(!fail3);

-		bool fail4 = wait(self->setRange(cx, KeyRangeRef(range.begin, middleKey), true, self->tenantName));
+		bool fail4 = wait(cx->blobbifyRange(KeyRangeRef(range.begin, middleKey), self->tenantName));
 		ASSERT(!fail4);

-		bool fail5 = wait(self->setRange(cx, KeyRangeRef(middleKey, range.end), true, self->tenantName));
+		bool fail5 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, range.end), self->tenantName));
 		ASSERT(!fail5);

-		bool fail6 = wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), true, self->tenantName));
+		bool fail6 = wait(cx->blobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName));
 		ASSERT(!fail6);

-		bool fail7 = wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), true, self->tenantName));
+		bool fail7 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName));
 		ASSERT(!fail7);

-		bool fail8 = wait(self->setRange(cx, KeyRangeRef(middleKey, middleKey2), true, self->tenantName));
+		bool fail8 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, middleKey2), self->tenantName));
 		ASSERT(!fail8);

 		{
@ -582,13 +571,14 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 				}
 			}

-			// tear down + check that un-blobbifying at a non-aligned range also doesn't work
-			Key purgeKey = wait(self->versionedForcePurge(cx, activeRange, self->tenantName));
+			state Version purgeVersion = deterministicRandom()->coinflip() ? latestVersion : 1;
+			state KeyRangeRef purgeRange = deterministicRandom()->coinflip() ? activeRange : range;
+			Key purgeKey = wait(cx->purgeBlobGranules(purgeRange, purgeVersion, self->tenantName, true));
 			wait(cx->waitPurgeGranulesComplete(purgeKey));

 			if (deterministicRandom()->coinflip()) {
 				// force purge again and ensure it is idempotent
-				Key purgeKeyAgain = wait(cx->purgeBlobGranules(activeRange, 1, self->tenantName, true));
+				Key purgeKeyAgain = wait(cx->purgeBlobGranules(purgeRange, purgeVersion, self->tenantName, true));
 				wait(cx->waitPurgeGranulesComplete(purgeKeyAgain));
 			}
 		}
@ -600,41 +590,38 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 			ASSERT(blobRanges.size() == 1);
 			ASSERT(blobRanges[0] == activeRange);

-			bool unblobbifyFail1 = wait(self->setRange(cx, range, false, self->tenantName));
+			bool unblobbifyFail1 = wait(cx->unblobbifyRange(range, self->tenantName));
 			ASSERT(!unblobbifyFail1);

 			bool unblobbifyFail2 =
-			    wait(self->setRange(cx, KeyRangeRef(range.begin, activeRange.end), false, self->tenantName));
+			    wait(cx->unblobbifyRange(KeyRangeRef(range.begin, activeRange.end), self->tenantName));
 			ASSERT(!unblobbifyFail2);

 			bool unblobbifyFail3 =
-			    wait(self->setRange(cx, KeyRangeRef(activeRange.begin, range.end), false, self->tenantName));
+			    wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, range.end), self->tenantName));
 			ASSERT(!unblobbifyFail3);

 			bool unblobbifyFail4 =
-			    wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), false, self->tenantName));
+			    wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName));
 			ASSERT(!unblobbifyFail4);

-			bool unblobbifyFail5 =
-			    wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), false, self->tenantName));
+			bool unblobbifyFail5 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName));
 			ASSERT(!unblobbifyFail5);

 			bool unblobbifyFail6 =
-			    wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), false, self->tenantName));
+			    wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName));
 			ASSERT(!unblobbifyFail6);

-			bool unblobbifyFail7 =
-			    wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), false, self->tenantName));
+			bool unblobbifyFail7 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName));
 			ASSERT(!unblobbifyFail7);

-			bool unblobbifyFail8 =
-			    wait(self->setRange(cx, KeyRangeRef(middleKey, middleKey2), false, self->tenantName));
+			bool unblobbifyFail8 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, middleKey2), self->tenantName));
 			ASSERT(!unblobbifyFail8);

-			bool unblobbifySuccess = wait(self->setRange(cx, activeRange, true, self->tenantName));
+			bool unblobbifySuccess = wait(cx->unblobbifyRange(activeRange, self->tenantName));
 			ASSERT(unblobbifySuccess);

-			bool unblobbifySuccessAgain = wait(self->setRange(cx, activeRange, true, self->tenantName));
+			bool unblobbifySuccessAgain = wait(cx->unblobbifyRange(activeRange, self->tenantName));
 			ASSERT(unblobbifySuccessAgain);
 		}

@ -642,7 +629,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 	}

 	ACTOR Future<Void> reBlobbifyUnit(Database cx, BlobGranuleRangesWorkload* self, KeyRange range) {
-		bool setSuccess = wait(self->setRange(cx, range, true, self->tenantName));
+		bool setSuccess = wait(cx->blobbifyRange(range, self->tenantName));
 		ASSERT(setSuccess);
 		wait(self->checkRange(cx, self, range, true));

@ -651,11 +638,11 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 		wait(cx->waitPurgeGranulesComplete(purgeKey));
 		wait(self->checkRange(cx, self, range, false));

-		bool unsetSuccess = wait(self->setRange(cx, range, false, self->tenantName));
+		bool unsetSuccess = wait(cx->unblobbifyRange(range, self->tenantName));
 		ASSERT(unsetSuccess);
 		wait(self->checkRange(cx, self, range, false));

-		bool reSetSuccess = wait(self->setRange(cx, range, true, self->tenantName));
+		bool reSetSuccess = wait(cx->blobbifyRange(range, self->tenantName));
 		ASSERT(reSetSuccess);
 		wait(self->checkRange(cx, self, range, true));

--- a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp
+++ b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp
@ -305,6 +305,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
 		state Version prevPurgeVersion = -1;
 		state UID dbgId = debugRandom()->randomUniqueID();
 		state Version newPurgeVersion = 0;
+		// usually we want randomness to verify maximum data, but sometimes hotspotting a subset is good too
+		state bool pickGranuleUniform = deterministicRandom()->random01() < 0.1;

 		TraceEvent("BlobGranuleVerifierStart");
 		if (BGV_DEBUG) {
@ -458,7 +460,13 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
 				}

 				// pick a random range
-				int rIndex = deterministicRandom()->randomInt(0, self->granuleRanges.get().size());
+				size_t granuleCount = self->granuleRanges.get().size();
+				size_t rIndex;
+				if (pickGranuleUniform) {
+					rIndex = deterministicRandom()->randomInt(0, granuleCount);
+				} else {
+					rIndex = deterministicRandom()->randomSkewedUInt32(0, granuleCount);
+				}
 				state KeyRange range = self->granuleRanges.get()[rIndex];

 				state std::pair<RangeResult, Version> fdb = wait(readFromFDB(cx, range));
--- a/fdbserver/workloads/ConsistencyCheck.actor.cpp
+++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp
@ -30,7 +30,7 @@
 #include "flow/IRateControl.h"
 #include "fdbrpc/simulator.h"
 #include "fdbserver/Knobs.h"
-#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/StorageMetrics.actor.h"
 #include "fdbserver/DataDistribution.actor.h"
 #include "fdbserver/QuietDatabase.h"
 #include "fdbserver/TSSMappingUtil.actor.h"
@ -394,6 +394,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
 		state Standalone<VectorRef<KeyValueRef>>
 		    serverList; // "\xff/serverList/[[serverID]]" := "[[StorageServerInterface]]"
 		state Standalone<VectorRef<KeyValueRef>> serverTag; // "\xff/serverTag/[[serverID]]" = "[[Tag]]"
+		state bool testResult = true;

 		std::vector<Future<bool>> cacheResultsPromise;
 		cacheResultsPromise.push_back(self->fetchKeyValuesFromSS(cx, self, storageCacheKeys, cacheKeyPromise, true));
@ -581,7 +582,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
 					for (j = 0; j < keyValueFutures.size(); j++) {
 						ErrorOr<GetKeyValuesReply> rangeResult = keyValueFutures[j].get();
 						// if (rangeResult.isError()) {
-						// 	throw rangeResult.getError();
+						//	throw rangeResult.getError();
 						// }

 						// Compare the results with other storage servers
@ -709,7 +710,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
 									    .detail("MatchingKVPairs", matchingKVPairs);

 									self->testFailure("Data inconsistent", true);
-									return false;
+									testResult = false;
 								}
 							}
 						}
@ -755,7 +756,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
 				    .detail("BytesRead", bytesReadInRange);
 			}
 		}
-		return true;
+		return testResult;
 	}

 	// Directly fetch key/values from storage servers through GetKeyValuesRequest
--- a/fdbserver/workloads/IncrementalBackup.actor.cpp
+++ b/fdbserver/workloads/IncrementalBackup.actor.cpp
@ -20,6 +20,7 @@

 #include "fdbclient/FDBTypes.h"
 #include "fdbclient/Knobs.h"
+#include "fdbclient/ManagementAPI.actor.h"
 #include "fdbclient/SystemData.h"
 #include "fdbclient/ReadYourWrites.h"
 #include "fdbrpc/simulator.h"
@ -150,6 +151,7 @@ struct IncrementalBackupWorkload : TestWorkload {

 		if (self->submitOnly) {
 			TraceEvent("IBackupSubmitAttempt").log();
+			state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
 			try {
 				wait(self->backupAgent.submitBackup(cx,
 				                                    self->backupDir,
@ -158,7 +160,8 @@ struct IncrementalBackupWorkload : TestWorkload {
 				                                    1e8,
 				                                    self->tag.toString(),
 				                                    backupRanges,
-				                                    SERVER_KNOBS->ENABLE_ENCRYPTION,
+				                                    SERVER_KNOBS->ENABLE_ENCRYPTION &&
+				                                        configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
 				                                    StopWhenDone::False,
 				                                    UsePartitionedLog::False,
 				                                    IncrementalBackupOnly::True));
@ -227,19 +230,56 @@ struct IncrementalBackupWorkload : TestWorkload {
 			    .detail("Size", containers.size())
 			    .detail("First", containers.front());
 			state Key backupURL = Key(containers.front());
+
+			state Standalone<VectorRef<KeyRangeRef>> restoreRange;
+			state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
+			for (auto r : backupRanges) {
+				if (!SERVER_KNOBS->ENABLE_ENCRYPTION || !r.intersects(getSystemBackupRanges())) {
+					restoreRange.push_back_deep(restoreRange.arena(), r);
+				} else {
+					KeyRangeRef normalKeyRange = r & normalKeys;
+					KeyRangeRef systemKeyRange = r & systemKeys;
+					if (!normalKeyRange.empty()) {
+						restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange);
+					}
+					if (!systemKeyRange.empty()) {
+						systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
+					}
+				}
+			}
+			if (!systemRestoreRange.empty()) {
+				TraceEvent("IBackupSystemRestoreAttempt").detail("BeginVersion", beginVersion);
+				wait(success(self->backupAgent.restore(cx,
+				                                       cx,
+				                                       "system_restore"_sr,
+				                                       backupURL,
+				                                       {},
+				                                       systemRestoreRange,
+				                                       WaitForComplete::True,
+				                                       invalidVersion,
+				                                       Verbose::True,
+				                                       Key(),
+				                                       Key(),
+				                                       LockDB::True,
+				                                       UnlockDB::True,
+				                                       OnlyApplyMutationLogs::True,
+				                                       InconsistentSnapshotOnly::False,
+				                                       beginVersion)));
+			}
 			TraceEvent("IBackupRestoreAttempt").detail("BeginVersion", beginVersion);
 			wait(success(self->backupAgent.restore(cx,
 			                                       cx,
 			                                       Key(self->tag.toString()),
 			                                       backupURL,
 			                                       {},
-			                                       backupRanges,
+			                                       restoreRange,
 			                                       WaitForComplete::True,
 			                                       invalidVersion,
 			                                       Verbose::True,
 			                                       Key(),
 			                                       Key(),
 			                                       LockDB::True,
+			                                       UnlockDB::True,
 			                                       OnlyApplyMutationLogs::True,
 			                                       InconsistentSnapshotOnly::False,
 			                                       beginVersion)));
--- a/fdbserver/workloads/RestoreBackup.actor.cpp
+++ b/fdbserver/workloads/RestoreBackup.actor.cpp
@ -24,6 +24,7 @@
 #include "fdbrpc/simulator.h"
 #include "fdbclient/BackupAgent.actor.h"
 #include "fdbclient/BackupContainer.h"
+#include "fdbserver/Knobs.h"
 #include "fdbserver/workloads/workloads.actor.h"
 #include "flow/actorcompiler.h" // This must be the last #include.

@ -113,14 +114,43 @@ struct RestoreBackupWorkload : TestWorkload {
 		wait(delay(self->delayFor));
 		wait(waitOnBackup(self, cx));
 		wait(clearDatabase(cx));
-		wait(success(self->backupAgent.restore(cx,
-		                                       cx,
-		                                       self->tag,
-		                                       Key(self->backupContainer->getURL()),
-		                                       self->backupContainer->getProxy(),
-		                                       WaitForComplete::True,
-		                                       ::invalidVersion,
-		                                       Verbose::True)));
+		if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
+			// restore system keys
+			VectorRef<KeyRangeRef> systemBackupRanges = getSystemBackupRanges();
+			state std::vector<Future<Version>> restores;
+			for (int i = 0; i < systemBackupRanges.size(); i++) {
+				restores.push_back((self->backupAgent.restore(cx,
+				                                              cx,
+				                                              "system_restore"_sr,
+				                                              Key(self->backupContainer->getURL()),
+				                                              self->backupContainer->getProxy(),
+				                                              WaitForComplete::True,
+				                                              ::invalidVersion,
+				                                              Verbose::True,
+				                                              systemBackupRanges[i])));
+			}
+			waitForAll(restores);
+			// restore non-system keys
+			wait(success(self->backupAgent.restore(cx,
+			                                       cx,
+			                                       self->tag,
+			                                       Key(self->backupContainer->getURL()),
+			                                       self->backupContainer->getProxy(),
+			                                       WaitForComplete::True,
+			                                       ::invalidVersion,
+			                                       Verbose::True,
+			                                       normalKeys)));
+		} else {
+			wait(success(self->backupAgent.restore(cx,
+			                                       cx,
+			                                       self->tag,
+			                                       Key(self->backupContainer->getURL()),
+			                                       self->backupContainer->getProxy(),
+			                                       WaitForComplete::True,
+			                                       ::invalidVersion,
+			                                       Verbose::True)));
+		}
+
 		return Void();
 	}

--- a/fdbserver/workloads/RestoreFromBlob.actor.cpp
+++ b/fdbserver/workloads/RestoreFromBlob.actor.cpp
@ -18,9 +18,11 @@
 * limitations under the License.
 */

+#include "fdbclient/SystemData.h"
 #include "fdbrpc/simulator.h"
 #include "fdbclient/BackupAgent.actor.h"
 #include "fdbclient/BackupContainer.h"
+#include "fdbserver/Knobs.h"
 #include "fdbserver/workloads/BlobStoreWorkload.h"
 #include "fdbserver/workloads/workloads.actor.h"
 #include "fdbserver/workloads/BulkSetup.actor.h"
@ -52,13 +54,22 @@ struct RestoreFromBlobWorkload : TestWorkload {

 	ACTOR static Future<Void> _start(Database cx, RestoreFromBlobWorkload* self) {
 		state FileBackupAgent backupAgent;
-		state Standalone<VectorRef<KeyRangeRef>> restoreRanges;
-
-		addDefaultBackupRanges(restoreRanges);

 		wait(delay(self->restoreAfter));
-		Version v = wait(
-		    backupAgent.restore(cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete));
+		if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
+			// restore system keys followed by user keys
+			wait(success(backupAgent.restore(
+			    cx, {}, self->backupTag, self->backupURL, {}, getSystemBackupRanges(), self->waitForComplete)));
+			Standalone<VectorRef<KeyRangeRef>> restoreRanges;
+			restoreRanges.push_back_deep(restoreRanges.arena(), normalKeys);
+			wait(success(backupAgent.restore(
+			    cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete)));
+		} else {
+			Standalone<VectorRef<KeyRangeRef>> restoreRanges;
+			addDefaultBackupRanges(restoreRanges);
+			wait(success(backupAgent.restore(
+			    cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete)));
+		}
 		return Void();
 	}

--- a/fdbserver/workloads/StorageQuota.actor.cpp
+++ b/fdbserver/workloads/StorageQuota.actor.cpp
@ -38,17 +38,17 @@ struct StorageQuotaWorkload : TestWorkload {
 		wait(setStorageQuotaHelper(cx, "name2"_sr, 200));
 		wait(setStorageQuotaHelper(cx, "name1"_sr, 300));

-		state Optional<uint64_t> quota1 = wait(getStorageQuotaHelper(cx, "name1"_sr));
+		state Optional<int64_t> quota1 = wait(getStorageQuotaHelper(cx, "name1"_sr));
 		ASSERT(quota1.present() && quota1.get() == 300);
-		state Optional<uint64_t> quota2 = wait(getStorageQuotaHelper(cx, "name2"_sr));
+		state Optional<int64_t> quota2 = wait(getStorageQuotaHelper(cx, "name2"_sr));
 		ASSERT(quota2.present() && quota2.get() == 200);
-		state Optional<uint64_t> quota3 = wait(getStorageQuotaHelper(cx, "name3"_sr));
+		state Optional<int64_t> quota3 = wait(getStorageQuotaHelper(cx, "name3"_sr));
 		ASSERT(!quota3.present());

 		return Void();
 	}

-	ACTOR static Future<Void> setStorageQuotaHelper(Database cx, StringRef tenantName, uint64_t quota) {
+	ACTOR static Future<Void> setStorageQuotaHelper(Database cx, StringRef tenantName, int64_t quota) {
 		state Transaction tr(cx);
 		loop {
 			try {
@ -61,11 +61,11 @@ struct StorageQuotaWorkload : TestWorkload {
 		}
 	}

-	ACTOR static Future<Optional<uint64_t>> getStorageQuotaHelper(Database cx, StringRef tenantName) {
+	ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, StringRef tenantName) {
 		state Transaction tr(cx);
 		loop {
 			try {
-				state Optional<uint64_t> quota = wait(getStorageQuota(&tr, tenantName));
+				state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantName));
 				wait(tr.commit());
 				return quota;
 			} catch (Error& e) {
--- a/fdbserver/workloads/SubmitBackup.actor.cpp
+++ b/fdbserver/workloads/SubmitBackup.actor.cpp
@ -19,6 +19,7 @@
 */

 #include "fdbclient/FDBTypes.h"
+#include "fdbclient/ManagementAPI.actor.h"
 #include "fdbclient/ReadYourWrites.h"
 #include "fdbrpc/simulator.h"
 #include "fdbclient/BackupAgent.actor.h"
@ -52,8 +53,9 @@ struct SubmitBackupWorkload : TestWorkload {

 	ACTOR static Future<Void> _start(SubmitBackupWorkload* self, Database cx) {
 		wait(delay(self->delayFor));
-		Standalone<VectorRef<KeyRangeRef>> backupRanges;
+		state Standalone<VectorRef<KeyRangeRef>> backupRanges;
 		addDefaultBackupRanges(backupRanges);
+		state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
 		try {
 			wait(self->backupAgent.submitBackup(cx,
 			                                    self->backupDir,
@ -62,7 +64,8 @@ struct SubmitBackupWorkload : TestWorkload {
 			                                    self->snapshotInterval,
 			                                    self->tag.toString(),
 			                                    backupRanges,
-			                                    SERVER_KNOBS->ENABLE_ENCRYPTION,
+			                                    SERVER_KNOBS->ENABLE_ENCRYPTION &&
+			                                        configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
 			                                    self->stopWhenDone,
 			                                    UsePartitionedLog::False,
 			                                    self->incremental));
--- a/fdbserver/workloads/TagThrottleApi.actor.cpp
+++ b/fdbserver/workloads/TagThrottleApi.actor.cpp
@ -43,7 +43,7 @@ struct TagThrottleApiWorkload : TestWorkload {
 	}

 	Future<Void> start(Database const& cx) override {
-		if (this->clientId != 0)
+		if (SERVER_KNOBS->GLOBAL_TAG_THROTTLING || this->clientId != 0)
 			return Void();
 		return timeout(runThrottleApi(this, cx), testDuration, Void());
 	}
--- a/fdbserver/workloads/GlobalTagThrottling.actor.cpp
+++ b/fdbserver/workloads/GlobalTagThrottling.actor.cpp
@ -1,5 +1,5 @@
 /*
- * GlobalTagThrottling.actor.cpp
+ * ThroughputQuota.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
@ -23,42 +23,46 @@

 #include "flow/actorcompiler.h" // This must be the last #include.

-class GlobalTagThrottlingWorkload : public TestWorkload {
+// This workload sets the throughput quota of a tag during the setup phase
+class ThroughputQuotaWorkload : public TestWorkload {
 	TransactionTag transactionTag;
 	double reservedQuota{ 0.0 };
 	double totalQuota{ 0.0 };

-	ACTOR static Future<Void> setup(GlobalTagThrottlingWorkload* self, Database cx) {
+	ACTOR static Future<Void> setup(ThroughputQuotaWorkload* self, Database cx) {
 		state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
 		loop {
 			try {
 				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
-				TraceEvent("GlobalTagThrottlingWorkload_SettingTagQuota")
-				    .detail("Tag", self->transactionTag)
+				TraceEvent("ThroughputQuotaWorkload_SettingTagQuota")
+				    .detail("Tag", printable(self->transactionTag))
 				    .detail("ReservedQuota", self->reservedQuota)
 				    .detail("TotalQuota", self->totalQuota);
 				ThrottleApi::setTagQuota(tr, self->transactionTag, self->reservedQuota, self->totalQuota);
 				wait(tr->commit());
 				return Void();
 			} catch (Error& e) {
-				TraceEvent("GlobalTagThrottlingWorkload_SetupError").error(e);
+				TraceEvent("ThroughputQuotaWorkload_SetupError").error(e);
 				wait(tr->onError(e));
 			}
 		};
 	}

 public:
-	static constexpr auto NAME = "GlobalTagThrottling";
-	explicit GlobalTagThrottlingWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
+	static constexpr auto NAME = "ThroughputQuota";
+	explicit ThroughputQuotaWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
 		transactionTag = getOption(options, "transactionTag"_sr, "sampleTag"_sr);
 		reservedQuota = getOption(options, "reservedQuota"_sr, 0.0);
 		totalQuota = getOption(options, "totalQuota"_sr, 0.0);
 	}

-	Future<Void> setup(Database const& cx) override { return clientId ? Void() : setup(this, cx); }
+	Future<Void> setup(Database const& cx) override {
+		DatabaseContext::debugUseTags = true;
+		return clientId ? Void() : setup(this, cx);
+	}
 	Future<Void> start(Database const& cx) override { return Void(); }
 	Future<bool> check(Database const& cx) override { return true; }
 	void getMetrics(std::vector<PerfMetric>& m) override {}
 };

-WorkloadFactory<GlobalTagThrottlingWorkload> GlobalTagThrottlingWorkloadFactory;
+WorkloadFactory<ThroughputQuotaWorkload> ThroughputQuotaWorkloadFactory;
--- a/flow/include/flow/error_definitions.h
+++ b/flow/include/flow/error_definitions.h
@ -131,6 +131,7 @@ ERROR( please_reboot_kv_store, 1219, "Need to reboot the storage engine")
 ERROR( incompatible_software_version, 1220, "Current software does not support database format" )
 ERROR( audit_storage_failed, 1221, "Validate storage consistency operation failed" )
 ERROR( audit_storage_exceeded_request_limit, 1222, "Exceeded the max number of allowed concurrent audit storage requests" )
+ERROR( proxy_tag_throttled, 1223, "Exceeded maximum proxy tag throttling duration" )

 // 15xx Platform errors
 ERROR( platform_error, 1500, "Platform error" )
--- a/packaging/docker/Dockerfile
+++ b/packaging/docker/Dockerfile
@ -178,13 +178,13 @@ RUN yum -y install \
    rm -rf /var/cache/yum

 WORKDIR /tmp
-RUN curl -Ls https://amazon-eks.s3.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl -o kubectl && \
-    echo "08ff68159bbcb844455167abb1d0de75bbfe5ae1b051f81ab060a1988027868a  kubectl" > kubectl.txt && \
+RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \
+    echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07  kubectl" > kubectl.txt && \
    sha256sum --quiet -c kubectl.txt && \
    mv kubectl /usr/local/bin/kubectl && \
    chmod 755 /usr/local/bin/kubectl && \
-    curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.2.43.zip -o "awscliv2.zip" && \
-    echo "9a8b3c4e7f72bbcc55e341dce3af42479f2730c225d6d265ee6f9162cfdebdfd  awscliv2.zip" > awscliv2.txt && \
+    curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.7.34.zip -o "awscliv2.zip" && \
+    echo "daf9253f0071b5cfee9532bc5220bedd7a5d29d4e0f92b42b9e3e4c496341e88  awscliv2.zip" > awscliv2.txt && \
    sha256sum --quiet -c awscliv2.txt && \
    unzip -qq awscliv2.zip && \
    ./aws/install && \
--- a/packaging/docker/Dockerfile.eks
+++ b/packaging/docker/Dockerfile.eks
@ -53,13 +53,13 @@ RUN curl -Ls https://github.com/krallin/tini/releases/download/v0.19.0/tini-amd6
    mv tini /usr/bin/ && \
    rm -rf /tmp/*

-RUN curl -Ls https://amazon-eks.s3.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl -o kubectl && \
-    echo "08ff68159bbcb844455167abb1d0de75bbfe5ae1b051f81ab060a1988027868a  kubectl" > kubectl.txt && \
+RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \
+    echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07  kubectl" > kubectl.txt && \
    sha256sum --quiet -c kubectl.txt && \
    mv kubectl /usr/local/bin/kubectl && \
    chmod 755 /usr/local/bin/kubectl && \
-    curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.2.43.zip -o "awscliv2.zip" && \
-    echo "9a8b3c4e7f72bbcc55e341dce3af42479f2730c225d6d265ee6f9162cfdebdfd  awscliv2.zip" > awscliv2.txt && \
+    curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.7.34.zip -o "awscliv2.zip" && \
+    echo "daf9253f0071b5cfee9532bc5220bedd7a5d29d4e0f92b42b9e3e4c496341e88  awscliv2.zip" > awscliv2.txt && \
    sha256sum --quiet -c awscliv2.txt && \
    unzip -qq awscliv2.zip && \
    ./aws/install && \
--- a/packaging/docker/run_ycsb.sh
+++ b/packaging/docker/run_ycsb.sh
@ -1,22 +1,44 @@
 #!/usr/bin/env bash
-set -Eeuxo pipefail
+set -Eeuo pipefail
+
+function logg () {
+    printf "##### $(date +'%Y-%m-%dT%H:%M:%SZ') #  %-56.55s #####\n" "${1}"
+}
+
+function error_exit () {
+    echo "################################################################################"
+    logg "${0} FAILED"
+    logg "RUN_ID: ${RUN_ID}"
+    logg "WORKLOAD: ${WORKLOAD}"
+    logg "ENVIRONMENT IS:"
+    env
+    echo "################################################################################"
+}
+
+trap error_exit ERR

 namespace=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)
-POD_NUM=$(echo $POD_NAME | cut -d - -f3)
-KEY="ycsb_load_${POD_NUM}_of_${NUM_PODS}_complete"
-CLI=$(ls /var/dynamic-conf/bin/*/fdbcli | head -n1)

-echo "WAITING FOR ALL PODS TO COME UP"
-while [[ $(kubectl get pods -n ${namespace} -l name=ycsb,run=${RUN_ID} --field-selector=status.phase=Running | grep -cv NAME) -lt ${NUM_PODS} ]]; do
+logg "WAITING FOR ${NUM_PODS} PODS TO COME UP IN ${namespace}"
+while [[ $(kubectl get pods -n "${namespace}" -l name=ycsb,run="${RUN_ID}" --field-selector=status.phase=Running | grep -cv NAME) -lt ${NUM_PODS} ]]; do
    sleep 1
 done
-echo "ALL PODS ARE UP"
+logg "${NUM_PODS} PODS ARE UP IN ${namespace}"

-echo "RUNNING YCSB"
-./bin/ycsb.sh ${MODE} foundationdb -s -P workloads/${WORKLOAD} ${YCSB_ARGS}
-echo "YCSB FINISHED"
+logg "RUNNING YCSB ${WORKLOAD}"
+set -x
+./bin/ycsb.sh "${MODE}" foundationdb -s -P "workloads/${WORKLOAD}" "${YCSB_ARGS}"
+set +x
+logg "YCSB ${WORKLOAD} FINISHED"

-echo "COPYING HISTOGRAMS TO S3"
-aws s3 sync --sse aws:kms --exclude "*" --include "histogram.*" /tmp s3://${BUCKET}/ycsb_histograms/${namespace}/${POD_NAME}
-echo "COPYING HISTOGRAMS TO S3 FINISHED"
+logg "COPYING HISTOGRAMS TO S3"
+set -x
+aws s3 sync --sse aws:kms --exclude "*" --include "histogram.*" /tmp "s3://${BUCKET}/ycsb_histograms/${namespace}/${POD_NAME}"
+set +x
+logg "COPYING HISTOGRAMS TO S3 FINISHED"

+echo "################################################################################"
+logg "COMPLETED ${0}"
+logg "RUN_ID: ${RUN_ID}"
+logg "WORKLOAD: ${WORKLOAD}"
+echo "################################################################################"
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -226,7 +226,6 @@ if(WITH_PYTHON)
  add_fdb_test(TEST_FILES rare/CycleWithDeadHall.toml)
  add_fdb_test(TEST_FILES rare/DataDistributionMetrics.toml)
  add_fdb_test(TEST_FILES rare/FuzzTest.toml)
-  add_fdb_test(TEST_FILES rare/GlobalTagThrottling.toml IGNORE)
  add_fdb_test(TEST_FILES rare/HighContentionPrefixAllocator.toml)
  add_fdb_test(TEST_FILES rare/InventoryTestHeavyWrites.toml)
  add_fdb_test(TEST_FILES rare/LargeApiCorrectness.toml)
@ -240,6 +239,7 @@ if(WITH_PYTHON)
  add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml)
  add_fdb_test(TEST_FILES rare/RedwoodDeltaTree.toml)
  add_fdb_test(TEST_FILES rare/Throttling.toml)
+  add_fdb_test(TEST_FILES rare/ThroughputQuota.toml)
  add_fdb_test(TEST_FILES rare/TransactionTagApiCorrectness.toml)
  add_fdb_test(TEST_FILES rare/TransactionTagSwizzledApiCorrectness.toml)
  add_fdb_test(TEST_FILES rare/WriteTagThrottling.toml)
--- a/Show More
+++ b/Show More