Merge remote-tracking branch 'origin/main' into expose-txn-cost

This commit is contained in:
sfc-gh-tclinkenbeard 2022-10-30 09:36:37 -07:00
commit 0eb1598afa
143 changed files with 4499 additions and 2178 deletions

View File

@ -442,7 +442,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-sanitizer
DEPENDS ${IMPLIBSO_SRC} fdb_c
COMMENT "Generating source code for C shim library")
add_library(fdb_c_shim STATIC ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp)
add_library(fdb_c_shim SHARED ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp)
target_link_options(fdb_c_shim PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map,-z,nodelete,-z,noexecstack")
target_link_libraries(fdb_c_shim PUBLIC dl)
target_include_directories(fdb_c_shim PUBLIC

View File

@ -59,6 +59,8 @@
#include "shm.hpp"
#include "stats.hpp"
#include "time.hpp"
#include "rapidjson/document.h"
#include "rapidjson/error/en.h"
namespace mako {
@ -88,14 +90,29 @@ Transaction createNewTransaction(Database db, Arguments const& args, int id = -1
}
// Create Tenant Transaction
int tenant_id = (id == -1) ? urand(0, args.active_tenants - 1) : id;
Transaction tr;
std::string tenantStr;
// If provided tenants array, use it
if (tenants) {
return tenants[tenant_id].createTransaction();
tr = tenants[tenant_id].createTransaction();
} else {
tenantStr = "tenant" + std::to_string(tenant_id);
BytesRef tenant_name = toBytesRef(tenantStr);
Tenant t = db.openTenant(tenant_name);
tr = t.createTransaction();
}
std::string tenantStr = "tenant" + std::to_string(tenant_id);
BytesRef tenant_name = toBytesRef(tenantStr);
Tenant t = db.openTenant(tenant_name);
return t.createTransaction();
if (!args.authorization_tokens.empty()) {
// lookup token based on tenant name and, if found, set authz token to transaction
if (tenantStr.empty())
tenantStr = "tenant" + std::to_string(tenant_id);
auto tokenMapItr = args.authorization_tokens.find(tenantStr);
if (tokenMapItr != args.authorization_tokens.end()) {
tr.setOption(FDB_TR_OPTION_AUTHORIZATION_TOKEN, tokenMapItr->second);
} else {
logr.warn("Authorization token map is not empty, but could not find token for tenant '{}'", tenantStr);
}
}
return tr;
}
uint64_t byteswapHelper(uint64_t input) {
@ -815,6 +832,18 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
logr.error("network::setOption(FDB_NET_OPTION_DISTRIBUTED_CLIENT_TRACER): {}", err.what());
}
if (args.tls_certificate_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_CERT_PATH, args.tls_certificate_file.value());
}
if (args.tls_key_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_KEY_PATH, args.tls_key_file.value());
}
if (args.tls_ca_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_CA_PATH, args.tls_ca_file.value());
}
/* enable flatbuffers if specified */
if (args.flatbuffers) {
#ifdef FDB_NET_OPTION_USE_FLATBUFFERS
@ -982,57 +1011,55 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
}
/* initialize the parameters with default values */
int initArguments(Arguments& args) {
memset(&args, 0, sizeof(Arguments)); /* zero-out everything */
args.num_fdb_clusters = 0;
args.num_databases = 1;
args.api_version = maxApiVersion();
args.json = 0;
args.num_processes = 1;
args.num_threads = 1;
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
args.tpsmax = 0;
args.tpsmin = -1;
args.tpsinterval = 10;
args.tpschange = TPS_SIN;
args.sampling = 1000;
args.key_length = 32;
args.value_length = 16;
args.active_tenants = 0;
args.total_tenants = 0;
args.tenant_batch_size = 10000;
args.zipf = 0;
args.commit_get = 0;
args.verbose = 1;
args.flatbuffers = 0; /* internal */
args.knobs[0] = '\0';
args.log_group[0] = '\0';
args.prefixpadding = 0;
args.trace = 0;
args.tracepath[0] = '\0';
args.traceformat = 0; /* default to client's default (XML) */
args.streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
args.txntrace = 0;
args.txntagging = 0;
memset(args.txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
Arguments::Arguments() {
num_fdb_clusters = 0;
num_databases = 1;
api_version = maxApiVersion();
json = 0;
num_processes = 1;
num_threads = 1;
async_xacts = 0;
mode = MODE_INVALID;
rows = 100000;
load_factor = 1.0;
row_digits = digits(rows);
seconds = 30;
iteration = 0;
tpsmax = 0;
tpsmin = -1;
tpsinterval = 10;
tpschange = TPS_SIN;
sampling = 1000;
key_length = 32;
value_length = 16;
active_tenants = 0;
total_tenants = 0;
tenant_batch_size = 10000;
zipf = 0;
commit_get = 0;
verbose = 1;
flatbuffers = 0; /* internal */
knobs[0] = '\0';
log_group[0] = '\0';
prefixpadding = 0;
trace = 0;
tracepath[0] = '\0';
traceformat = 0; /* default to client's default (XML) */
streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
txntrace = 0;
txntagging = 0;
memset(txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
for (auto i = 0; i < MAX_OP; i++) {
args.txnspec.ops[i][OP_COUNT] = 0;
txnspec.ops[i][OP_COUNT] = 0;
}
args.client_threads_per_version = 0;
args.disable_client_bypass = false;
args.disable_ryw = 0;
args.json_output_path[0] = '\0';
args.stats_export_path[0] = '\0';
args.bg_materialize_files = false;
args.bg_file_path[0] = '\0';
args.distributed_tracer_client = 0;
return 0;
client_threads_per_version = 0;
disable_client_bypass = false;
disable_ryw = 0;
json_output_path[0] = '\0';
stats_export_path[0] = '\0';
bg_materialize_files = false;
bg_file_path[0] = '\0';
distributed_tracer_client = 0;
}
/* parse transaction specification */
@ -1279,6 +1306,10 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "bg_file_path", required_argument, NULL, ARG_BG_FILE_PATH },
{ "stats_export_path", optional_argument, NULL, ARG_EXPORT_PATH },
{ "distributed_tracer_client", required_argument, NULL, ARG_DISTRIBUTED_TRACER_CLIENT },
{ "tls_certificate_file", required_argument, NULL, ARG_TLS_CERTIFICATE_FILE },
{ "tls_key_file", required_argument, NULL, ARG_TLS_KEY_FILE },
{ "tls_ca_file", required_argument, NULL, ARG_TLS_CA_FILE },
{ "authorization_token_file", required_argument, NULL, ARG_AUTHORIZATION_TOKEN_FILE },
{ NULL, 0, NULL, 0 }
};
idx = 0;
@ -1515,6 +1546,45 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.distributed_tracer_client = -1;
}
break;
case ARG_TLS_CERTIFICATE_FILE:
args.tls_certificate_file = std::string(optarg);
break;
case ARG_TLS_KEY_FILE:
args.tls_key_file = std::string(optarg);
break;
case ARG_TLS_CA_FILE:
args.tls_ca_file = std::string(optarg);
break;
case ARG_AUTHORIZATION_TOKEN_FILE: {
std::string tokenFilename(optarg);
std::ifstream ifs(tokenFilename);
std::ostringstream oss;
oss << ifs.rdbuf();
rapidjson::Document d;
d.Parse(oss.str().c_str());
if (d.HasParseError()) {
logr.error("Failed to parse authorization token JSON file '{}': {} at offset {}",
tokenFilename,
GetParseError_En(d.GetParseError()),
d.GetErrorOffset());
return -1;
} else if (!d.IsObject()) {
logr.error("Authorization token JSON file '{}' must contain a JSON object", tokenFilename);
return -1;
}
for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) {
if (!itr->value.IsString()) {
logr.error("Token '{}' is not a string", itr->name.GetString());
return -1;
}
args.authorization_tokens.insert_or_assign(
std::string(itr->name.GetString(), itr->name.GetStringLength()),
std::string(itr->value.GetString(), itr->value.GetStringLength()));
}
logr.info("Added {} tenant authorization tokens to map from file '{}'",
args.authorization_tokens.size(),
tokenFilename);
} break;
}
}
@ -1525,93 +1595,97 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
return 0;
}
int validateArguments(Arguments const& args) {
if (args.mode == MODE_INVALID) {
int Arguments::validate() {
if (mode == MODE_INVALID) {
logr.error("--mode has to be set");
return -1;
}
if (args.verbose < VERBOSE_NONE || args.verbose > VERBOSE_DEBUG) {
if (verbose < VERBOSE_NONE || verbose > VERBOSE_DEBUG) {
logr.error("--verbose must be between 0 and 3");
return -1;
}
if (args.rows <= 0) {
if (rows <= 0) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
if (load_factor <= 0 || load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
if (key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
}
if (args.value_length < 0) {
if (value_length < 0) {
logr.error("--vallen must be a positive integer");
return -1;
}
if (args.num_fdb_clusters > NUM_CLUSTERS_MAX) {
if (num_fdb_clusters > NUM_CLUSTERS_MAX) {
logr.error("Mako is not supported to do work to more than {} clusters", NUM_CLUSTERS_MAX);
return -1;
}
if (args.num_databases > NUM_DATABASES_MAX) {
if (num_databases > NUM_DATABASES_MAX) {
logr.error("Mako is not supported to do work to more than {} databases", NUM_DATABASES_MAX);
return -1;
}
if (args.num_databases < args.num_fdb_clusters) {
logr.error("--num_databases ({}) must be >= number of clusters({})", args.num_databases, args.num_fdb_clusters);
if (num_databases < num_fdb_clusters) {
logr.error("--num_databases ({}) must be >= number of clusters({})", num_databases, num_fdb_clusters);
return -1;
}
if (args.num_threads < args.num_databases) {
logr.error("--threads ({}) must be >= number of databases ({})", args.num_threads, args.num_databases);
if (num_threads < num_databases) {
logr.error("--threads ({}) must be >= number of databases ({})", num_threads, num_databases);
return -1;
}
if (args.key_length < 4 /* "mako" */ + args.row_digits) {
if (key_length < 4 /* "mako" */ + row_digits) {
logr.error("--keylen must be larger than {} to store \"mako\" prefix "
"and maximum row number",
4 + args.row_digits);
4 + row_digits);
return -1;
}
if (args.active_tenants > args.total_tenants) {
if (active_tenants > total_tenants) {
logr.error("--active_tenants must be less than or equal to --total_tenants");
return -1;
}
if (args.tenant_batch_size < 1) {
if (tenant_batch_size < 1) {
logr.error("--tenant_batch_size must be at least 1");
return -1;
}
if (args.mode == MODE_RUN) {
if ((args.seconds > 0) && (args.iteration > 0)) {
if (mode == MODE_RUN) {
if ((seconds > 0) && (iteration > 0)) {
logr.error("Cannot specify seconds and iteration together");
return -1;
}
if ((args.seconds == 0) && (args.iteration == 0)) {
if ((seconds == 0) && (iteration == 0)) {
logr.error("Must specify either seconds or iteration");
return -1;
}
if (args.txntagging < 0) {
if (txntagging < 0) {
logr.error("--txntagging must be a non-negative integer");
return -1;
}
}
// ensure that all of the files provided to mako are valid and exist
if (args.mode == MODE_REPORT) {
if (!args.num_report_files) {
if (mode == MODE_REPORT) {
if (!num_report_files) {
logr.error("No files to merge");
}
for (int i = 0; i < args.num_report_files; i++) {
for (int i = 0; i < num_report_files; i++) {
struct stat buffer;
if (stat(args.report_files[i], &buffer) != 0) {
logr.error("Couldn't open file {}", args.report_files[i]);
if (stat(report_files[i], &buffer) != 0) {
logr.error("Couldn't open file {}", report_files[i]);
return -1;
}
}
}
if (args.distributed_tracer_client < 0) {
logr.error("--disibuted_tracer_client must specify either (disabled, network_lossy, log_file)");
if (distributed_tracer_client < 0) {
logr.error("--distributed_tracer_client must specify either (disabled, network_lossy, log_file)");
return -1;
}
if (!authorization_tokens.empty() && !tls_ca_file.has_value()) {
logr.warn("Authorization tokens are being used without explicit TLS CA file configured");
}
return 0;
}
@ -2262,11 +2336,6 @@ int main(int argc, char* argv[]) {
auto rc = int{};
auto args = Arguments{};
rc = initArguments(args);
if (rc < 0) {
logr.error("initArguments failed");
return -1;
}
rc = parseArguments(argc, argv, args);
if (rc < 0) {
/* usage printed */
@ -2282,7 +2351,7 @@ int main(int argc, char* argv[]) {
args.total_tenants = args.active_tenants;
}
rc = validateArguments(args);
rc = args.validate();
if (rc < 0)
return -1;
logr.setVerbosity(args.verbose);

View File

@ -30,6 +30,7 @@
#include <cassert>
#include <chrono>
#include <list>
#include <map>
#include <vector>
#include <string_view>
#include <fdb_api.hpp>
@ -79,7 +80,11 @@ enum ArgKind {
ARG_JSON_REPORT,
ARG_BG_FILE_PATH, // if blob granule files are stored locally, mako will read and materialize them if this is set
ARG_EXPORT_PATH,
ARG_DISTRIBUTED_TRACER_CLIENT
ARG_DISTRIBUTED_TRACER_CLIENT,
ARG_TLS_CERTIFICATE_FILE,
ARG_TLS_KEY_FILE,
ARG_TLS_CA_FILE,
ARG_AUTHORIZATION_TOKEN_FILE,
};
constexpr const int OP_COUNT = 0;
@ -131,6 +136,9 @@ constexpr const int MAX_REPORT_FILES = 200;
/* benchmark parameters */
struct Arguments {
Arguments();
int validate();
int api_version;
int json;
int num_processes;
@ -180,6 +188,10 @@ struct Arguments {
char report_files[MAX_REPORT_FILES][PATH_MAX];
int num_report_files;
int distributed_tracer_client;
std::optional<std::string> tls_certificate_file;
std::optional<std::string> tls_key_file;
std::optional<std::string> tls_ca_file;
std::map<std::string, std::string> authorization_tokens; // maps tenant name to token string
};
} // namespace mako

View File

@ -38,7 +38,7 @@ Arguments
| - ``build``: Populate data
| - ``run``: Run the benchmark
- | ``-c | --cluster <cluster file>``
- | ``-c | --cluster <cluster_file>``
| FDB cluster files (Required, comma-separated)
- | ``-d | --num_databases <num_databases>``
@ -125,9 +125,21 @@ Arguments
| Disable snapshot read-your-writes
- | ``--json_report`` defaults to ``mako.json``
| ``--json_report=PATH``
| ``--json_report <path>``
| Output stats to the specified json file
- | ``--tls_certificate_file <path>``
| Use TLS certificate located in ``<path>``
- | ``--tls_key_file <path>``
| Use TLS key file located in ``<path>``
- | ``--tls_ca_file <path>``
| Use TLS CA file located in ``<path>``
- | ``--authorization_token_file <path>``
| Use authorization token JSON file located in ``<path>``
| Expected content is a JSON object where each key is a tenant name and the mapped value is a token string
Transaction Specification
=========================

View File

@ -76,38 +76,11 @@ function(generate_coverage_xml)
add_dependencies(coverage_${target_name} coveragetool)
endfunction()
# This function asserts that `versions.h` does not exist in the source
# directory. It does this in the prebuild phase of the target.
# This is an ugly hack that should make sure that cmake isn't used with
# a source directory in which FDB was previously built with `make`.
function(assert_no_version_h target)
message(STATUS "Check versions.h on ${target}")
set(target_name "${target}_versions_h_check")
if (DEFINED ENV{VERBOSE})
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMAND echo
"${CMAKE_COMMAND}" -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
-DFILE="${CMAKE_SOURCE_DIR}/versions.h"
COMMENT "Check old build system wasn't used in source dir")
else()
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMENT "Check old build system wasn't used in source dir")
endif()
add_dependencies(${target} ${target_name})
endfunction()
add_custom_target(strip_targets)
add_dependencies(packages strip_targets)
function(strip_debug_symbols target)
if (WIN32)
if(WIN32)
return()
endif()
get_target_property(target_type ${target} TYPE)
@ -146,7 +119,7 @@ function(strip_debug_symbols target)
COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
COMMENT "Copy debug symbols to ${out_name}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
else()
add_custom_target(strip_${target})
add_dependencies(strip_${target} strip_only_${target})
@ -171,7 +144,7 @@ function(copy_headers)
foreach(f IN LISTS CP_SRCS)
is_prefix(bd "${CMAKE_CURRENT_BINARY_DIR}" "${f}")
is_prefix(sd "${CMAKE_CURRENT_SOURCE_DIR}" "${f}")
if (bd OR sd)
if(bd OR sd)
continue()
endif()
is_header(hdr "${f}")
@ -180,7 +153,7 @@ function(copy_headers)
endif()
get_filename_component(fname ${f} NAME)
get_filename_component(dname ${f} DIRECTORY)
if (dname)
if(dname)
make_directory(${incl_dir}/${dname})
endif()
set(fpath "${incl_dir}/${dname}/${fname}")
@ -309,9 +282,6 @@ function(add_flow_target)
add_custom_target(${AFT_NAME}_actors DEPENDS ${generated_files})
add_dependencies(${AFT_NAME} ${AFT_NAME}_actors)
if(NOT WIN32)
assert_no_version_h(${AFT_NAME}_actors)
endif()
generate_coverage_xml(${AFT_NAME})
if(strip_target)
strip_debug_symbols(${AFT_NAME})

View File

@ -8,40 +8,43 @@ endif()
include(ExternalProject)
ExternalProject_Add(awssdk_project
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
# it seems advice.detachedHead breaks something which causes aws sdk to always be rebuilt.
# This option forces cmake to build the aws sdk only once and never attempt to update it
UPDATE_DISCONNECTED ON
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
# the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
add_library(awssdk_core STATIC IMPORTED)
add_dependencies(awssdk_core awssdk_project)

View File

@ -303,7 +303,6 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@ -315,7 +314,7 @@ class TestRun:
# state for the run
self.retryable_error: bool = False
self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed,
will_restart=will_restart)
will_restart=will_restart, long_running=config.long_running)
self.run_time: int = 0
self.success = self.run()
@ -367,6 +366,11 @@ class TestRun:
command += ['-b', 'on']
if config.crash_on_error:
command.append('--crash')
if config.long_running:
# disable simulation speedup
command += ['--knob-sim-speedup-after-seconds=36000']
# disable traceTooManyLines Error MAX_TRACE_LINES
command += ['--knob-max-trace-lines=1000000000']
self.temp_path.mkdir(parents=True, exist_ok=True)
@ -376,7 +380,8 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
# No timeout for long running tests
timeout = 20 * config.kill_seconds if self.use_valgrind else (None if config.long_running else config.kill_seconds)
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)

View File

@ -159,13 +159,20 @@ class Parser:
pass
class XmlParser(Parser, xml.sax.handler.ContentHandler):
class XmlParser(Parser, xml.sax.handler.ContentHandler, xml.sax.handler.ErrorHandler):
def __init__(self):
super().__init__()
self.handler: ParseHandler | None = None
def parse(self, file: TextIO, handler: ParseHandler) -> None:
xml.sax.parse(file, self)
self.handler = handler
xml.sax.parse(file, self, errorHandler=self)
def error(self, exception):
pass
def fatalError(self, exception):
pass
def startElement(self, name, attrs) -> None:
attributes: Dict[str, str] = {}
@ -276,6 +283,7 @@ class TraceFiles:
raise StopIteration
self.current += 1
return self.trace_files[self.current - 1]
return TraceFilesIterator(self)
@ -283,11 +291,12 @@ class Summary:
def __init__(self, binary: Path, runtime: float = 0, max_rss: int | None = None,
was_killed: bool = False, uid: uuid.UUID | None = None, expected_unseed: int | None = None,
exit_code: int = 0, valgrind_out_file: Path | None = None, stats: str | None = None,
error_out: str = None, will_restart: bool = False):
error_out: str = None, will_restart: bool = False, long_running: bool = False):
self.binary = binary
self.runtime: float = runtime
self.max_rss: int | None = max_rss
self.was_killed: bool = was_killed
self.long_running = long_running
self.expected_unseed: int | None = expected_unseed
self.exit_code: int = exit_code
self.out: SummaryTree = SummaryTree('Test')
@ -388,6 +397,10 @@ class Summary:
if self.was_killed:
child = SummaryTree('ExternalTimeout')
child.attributes['Severity'] = '40'
if self.long_running:
# debugging info for long-running tests
child.attributes['LongRunning'] = '1'
child.attributes['Runtime'] = str(self.runtime)
self.out.append(child)
self.error = True
if self.max_rss is not None:
@ -426,7 +439,8 @@ class Summary:
lines = self.error_out.splitlines()
stderr_bytes = 0
for line in lines:
if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
if line.endswith(
"WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
# When running ASAN we expect to see this message. Boost coroutine should be using the correct asan annotations so that it shouldn't produce any false positives.
continue
if line.endswith("Warning: unimplemented fcntl command: 1036"):
@ -560,6 +574,9 @@ class Summary:
self.handler.add_handler(('Severity', '30'), parse_warning)
def parse_error(attrs: Dict[str, str]):
if 'ErrorIsInjectedFault' in attrs and attrs['ErrorIsInjectedFault'].lower() in ['1', 'true']:
# ignore injected errors. In newer fdb versions these will have a lower severity
return
self.errors += 1
self.error = True
if self.errors > config.max_errors:
@ -606,6 +623,7 @@ class Summary:
child.attributes['File'] = attrs['File']
child.attributes['Line'] = attrs['Line']
self.out.append(child)
self.handler.add_handler(('Type', 'BuggifySection'), buggify_section)
self.handler.add_handler(('Type', 'FaultInjected'), buggify_section)
@ -614,9 +632,11 @@ class Summary:
child.attributes['Name'] = attrs['Name']
child.attributes['File'] = attrs['File']
child.attributes['Line'] = attrs['Line']
self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test)
def stderr_severity(attrs: Dict[str, str]):
if 'NewSeverity' in attrs:
self.stderr_severity = attrs['NewSeverity']
self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity)

View File

@ -55,6 +55,6 @@ if __name__ == '__main__':
summary.summarize_files(files)
summary.out.dump(sys.stdout)
else:
summary = Summary(Path('bin/fdbserver'), was_killed=True)
summary = Summary(Path('bin/fdbserver'), was_killed=True, long_running=config.long_running)
summary.summarize_files(files)
summary.out.dump(sys.stdout)

View File

@ -34,20 +34,25 @@ Commit proxies would combine idempotency IDs for transactions within a batch. Th
## Value format
```
${protocol_version}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
${protocol_version}${timestamp}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
```
The batch index for each idempotency id can be reconstructed from the high order byte and low order bytes stored in the key and value, respectively. This is necessary for an "unknown_committed" transaction to recover its full version stamp. Batch index is a `short int`, i.e. 2 bytes.
The timestamp is the unix epoch stored as a little-endian signed 64-bit integer.
# Cleaning up old idempotency ids
After learning the result of an attempt to commit a transaction with an
idempotency id, the client may inform the cluster that it's no longer interested
in that id and the cluster can reclaim the space used to store the idempotency
id. The happy-path reply to a CommitTransactionRequest will say which proxy this
request should be sent to, and all idempotency ids for a database key will be
sent to the same proxy so that it can clear the key once it receives all of
them. The first proxy will also periodically clean up the oldest idempotency ids, based on a policy determined by two knobs. One knob will control the minimum lifetime of an idempotency id (i.e. don't delete anything younger than 1 day), and the other will control the target byte size of the idempotency keys (e.g. keep 100 MB of idempotency keys around).
id. The commit proxy that committed a batch is responsible for cleaning all
idempotency kv pairs from that batch, so clients must tell that specific proxy
that they're done with the id. The first proxy will also periodically clean up
the oldest idempotency ids, based on a policy determined by two knobs. One knob
will control the minimum lifetime of an idempotency id (i.e. don't delete
anything younger than 1 day), and the other will control the target byte size of
the idempotency keys (e.g. keep 100 MB of idempotency keys around).
# Commit protocol

View File

@ -49,7 +49,7 @@ master_doc = 'index'
# General information about the project.
project = u'FoundationDB'
copyright = u'2013-2021 Apple, Inc and the FoundationDB project authors'
copyright = u'2013-2022 Apple, Inc and the FoundationDB project authors'
# Load the version information from 'versions.target'
import xml.etree.ElementTree as ET

View File

@ -2365,6 +2365,7 @@ ACTOR Future<Void> runRestore(Database db,
KeyRef(addPrefix),
KeyRef(removePrefix),
LockDB::True,
UnlockDB::True,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,

View File

@ -83,7 +83,7 @@ BlobCipherMetrics::BlobCipherMetrics()
CounterSet(cc, "Backup"),
CounterSet(cc, "Test") }) {
specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); });
traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc);
traceFuture = cc.traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL);
}
std::string toString(BlobCipherMetrics::UsageType type) {

View File

@ -0,0 +1,109 @@
/*
* BlobMetadataUtils.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BlobMetadataUtils.h"
#include "fmt/format.h"
#include "flow/IRandom.h"
#include "flow/flow.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/S3BlobStore.h"
std::string buildPartitionPath(const std::string& url, const std::string& partition) {
ASSERT(!partition.empty());
ASSERT(partition.front() != '/');
ASSERT(partition.back() == '/');
StringRef u(url);
if (u.startsWith("file://"_sr)) {
ASSERT(u.endsWith("/"_sr));
return url + partition;
} else if (u.startsWith("blobstore://"_sr)) {
std::string resource;
std::string lastOpenError;
S3BlobStoreEndpoint::ParametersT backupParams;
std::string urlCopy = url;
Reference<S3BlobStoreEndpoint> bstore =
S3BlobStoreEndpoint::fromString(url, {}, &resource, &lastOpenError, &backupParams);
ASSERT(!resource.empty());
ASSERT(resource.back() != '/');
size_t resourceStart = url.find(resource);
ASSERT(resourceStart != std::string::npos);
return urlCopy.insert(resourceStart + resource.size(), "/" + partition);
} else {
// FIXME: support azure
throw backup_invalid_url();
}
}
// FIXME: make this (more) deterministic outside of simulation for FDBPerfKmsConnector
// Builds a randomized Standalone<BlobMetadataDetailsRef> for the given domain, for use by
// simulated/test KMS connectors. Chooses one of three partitioning schemes at random and
// fills in base/partition paths derived from baseUrl. Also assigns refresh/expire times:
// either now()-relative (knob-scaled) or effectively "never" (double max).
// NOTE(review): the order of deterministicRandom() calls here matters for simulation
// reproducibility — do not reorder.
Standalone<BlobMetadataDetailsRef> createRandomTestBlobMetadata(const std::string& baseUrl,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName) {
Standalone<BlobMetadataDetailsRef> metadata;
metadata.domainId = domainId;
// Keep domainName's backing memory alive for the lifetime of metadata.
metadata.arena().dependsOn(domainName.arena());
metadata.domainName = domainName;
// 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
int type = deterministicRandom()->randomInt(0, 3);
int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12);
TraceEvent ev(SevDebug, "SimBlobMetadata");
ev.detail("DomainId", domainId).detail("TypeNum", type).detail("PartitionCount", partitionCount);
if (type == 0) {
// single storage location
std::string partition = std::to_string(domainId) + "/";
metadata.base = StringRef(metadata.arena(), buildPartitionPath(baseUrl, partition));
ev.detail("Base", metadata.base);
}
if (type == 1) {
// simulate hash prefixing in s3: one shared base, randomly-named partition suffixes
metadata.base = StringRef(metadata.arena(), baseUrl);
ev.detail("Base", metadata.base);
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(metadata.arena(),
deterministicRandom()->randomUniqueID().shortString() + "-" +
std::to_string(domainId) + "/");
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
if (type == 2) {
// simulate separate storage location per partition; no shared base path
for (int i = 0; i < partitionCount; i++) {
std::string partition = std::to_string(domainId) + "_" + std::to_string(i) + "/";
metadata.partitions.push_back_deep(metadata.arena(), buildPartitionPath(baseUrl, partition));
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
// set random refresh + expire time; expireAt is always >= refreshAt
if (deterministicRandom()->coinflip()) {
metadata.refreshAt = now() + deterministicRandom()->random01() * CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
metadata.expireAt =
metadata.refreshAt + deterministicRandom()->random01() * CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
} else {
// never refresh/expire
metadata.refreshAt = std::numeric_limits<double>::max();
metadata.expireAt = metadata.refreshAt;
}
return metadata;
}

View File

@ -273,6 +273,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096;
init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096;
init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 );
init( PROXY_MAX_TAG_THROTTLE_DURATION, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
// busyness reporting
init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 );
@ -281,6 +282,7 @@ void ClientKnobs::initialize(Randomize randomize) {
// Blob granules
init( BG_MAX_GRANULE_PARALLELISM, 10 );
init( BG_TOO_MANY_GRANULES, 10000 );
init( BLOB_METADATA_REFRESH_INTERVAL, 3600 ); if ( randomize && BUGGIFY ) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 );
init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 );

View File

@ -22,6 +22,16 @@
#include "fdbclient/Knobs.h"
#include "fdbclient/NativeAPI.actor.h"
// Strips a keyspace prefix from both endpoints of a range. An endpoint that does not
// carry the prefix is clamped to the corresponding end of the normal-key space
// (allKeys.begin / allKeys.end). An empty prefix returns the range unchanged.
KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix) {
    if (prefix.empty()) {
        return range;
    }
    KeyRef newBegin = allKeys.begin;
    if (range.begin.startsWith(prefix)) {
        newBegin = range.begin.removePrefix(prefix);
    }
    KeyRef newEnd = allKeys.end;
    if (range.end.startsWith(prefix)) {
        newEnd = range.end.removePrefix(prefix);
    }
    return KeyRangeRef(newBegin, newEnd);
}
KeyRef keyBetween(const KeyRangeRef& keys) {
int pos = 0; // will be the position of the first difference between keys.begin and keys.end
int minSize = std::min(keys.begin.size(), keys.end.size());

View File

@ -167,6 +167,7 @@ public:
KeyBackedProperty<Key> removePrefix() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> unlockDBAfterRestore() { return configSpace.pack(__FUNCTION__sr); }
// XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges
KeyBackedProperty<KeyRange> restoreRange() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<std::vector<KeyRange>> restoreRanges() { return configSpace.pack(__FUNCTION__sr); }
@ -591,12 +592,11 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
ACTOR static Future<StringRef> decryptImpl(Database cx,
StringRef headerS,
BlobCipherEncryptHeader header,
const uint8_t* dataP,
int64_t dataLen,
Arena* arena) {
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP));
ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid());
validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header);
@ -606,7 +606,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
static Future<StringRef> decrypt(Database cx,
StringRef headerS,
BlobCipherEncryptHeader headerS,
const uint8_t* dataP,
int64_t dataLen,
Arena* arena) {
@ -651,7 +651,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
ACTOR static Future<Void> updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, KeyRef key) {
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self));
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self->tenantCache));
state Reference<AsyncVar<ClientDBInfo> const> dbInfo = self->cx->clientInfo;
// Get text and header cipher key
@ -693,12 +693,13 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; }
ACTOR static Future<std::pair<int64_t, TenantName>>
getEncryptionDomainDetailsImpl(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache, bool useTenantCache) {
ACTOR static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetailsImpl(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
if (isSystemKey(key)) {
return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
if (key.size() < TENANT_PREFIX_SIZE || !useTenantCache) {
if (key.size() < TENANT_PREFIX_SIZE) {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE);
@ -710,21 +711,10 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(KeyRef key,
EncryptedRangeFileWriter* self) {
// If tenants are disabled on a cluster then don't use the TenantEntryCache as it will result in alot of
// unnecessary cache misses. For a cluster configured in TenantMode::Optional, the backup performance may
// degrade if most of the mutations belong to an invalid tenant
TenantMode mode = self->cx->clientInfo->get().tenantMode;
bool useTenantCache = mode != TenantMode::DISABLED;
if (g_network->isSimulated() && mode == TenantMode::OPTIONAL_TENANT) {
// TODO: Currently simulation tests run with optional tenant mode but most data does not belong to any
// tenant. This results in many timeouts so disable using the tenant cache until optional tenant mode
// support with backups is more performant
useTenantCache = false;
}
CODE_PROBE(useTenantCache, "using tenant cache");
return getEncryptionDomainDetailsImpl(key, self->tenantCache, useTenantCache);
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
return getEncryptionDomainDetailsImpl(key, tenantCache);
}
// Handles the first block and internal blocks. Ends current block if needed.
@ -816,6 +806,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
curKeyTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
endKey = StringRef(k.begin(), TENANT_PREFIX_SIZE);
}
state ValueRef newValue = StringRef();
self->lastKey = k;
self->lastValue = v;
@ -834,9 +825,9 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
if (self->lastKey.size() == 0 || k.size() == 0) {
return false;
}
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self));
state std::pair<int64_t, TenantName> prevKeyTenantInfo = wait(getEncryptionDomainDetails(self->lastKey, self));
// crossing tenant boundaries so finish the current block using only the tenant prefix of the new key
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache));
state std::pair<int64_t, TenantName> prevKeyTenantInfo =
wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache));
if (curKeyTenantInfo.first != prevKeyTenantInfo.first) {
CODE_PROBE(true, "crossed tenant boundaries");
wait(handleTenantBondary(self, k, v, writeValue, curKeyTenantInfo));
@ -1040,11 +1031,18 @@ private:
Key lastValue;
};
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Reference<TenantEntryCache<Void>>> tenantCache,
Optional<BlobCipherEncryptHeader> encryptHeader) {
// Read begin key, if this fails then block was invalid.
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
state KeyRef prevKey = KeyRef(k, kLen);
state bool done = false;
state Optional<std::pair<int64_t, TenantName>> prevTenantInfo;
// Read kv pairs and end key
while (1) {
@ -1052,6 +1050,35 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
kLen = reader->consumeNetworkUInt32();
k = reader->consume(kLen);
// make sure that all keys in a block belong to exactly one tenant,
// unless its the last key in which case it can be a truncated (different) tenant prefix
if (encryptedBlock && g_network && g_network->isSimulated()) {
ASSERT(tenantCache.present());
ASSERT(encryptHeader.present());
state KeyRef curKey = KeyRef(k, kLen);
if (!prevTenantInfo.present()) {
std::pair<int64_t, TenantName> tenantInfo =
wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, tenantCache.get()));
prevTenantInfo = tenantInfo;
}
std::pair<int64_t, TenantName> curTenantInfo =
wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, tenantCache.get()));
if (!curKey.empty() && !prevKey.empty() && prevTenantInfo.get().first != curTenantInfo.first) {
ASSERT(!done);
if (curTenantInfo.first != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID &&
curTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(curKey.size() == TENANT_PREFIX_SIZE);
}
done = true;
}
// make sure that all keys (except possibly the last key) in a block are encrypted using the correct key
if (!prevKey.empty()) {
ASSERT(prevTenantInfo.get().first == encryptHeader.get().cipherTextDetails.encryptDomainId);
}
prevKey = curKey;
prevTenantInfo = curTenantInfo;
}
// If eof reached or first value len byte is 0xFF then a valid block end was reached.
if (reader->eof() || *reader->rptr == 0xFF) {
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
@ -1072,6 +1099,8 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1094,7 +1123,11 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
decodeKVPairs(&reader, &results);
wait(decodeKVPairs(&reader,
&results,
false,
Optional<Reference<TenantEntryCache<Void>>>(),
Optional<BlobCipherEncryptHeader>()));
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1108,7 +1141,8 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// read encryption header
const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
StringRef header = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
StringRef headerS = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize;
// calculate the total bytes read up to (and including) the header
int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize;
@ -1117,7 +1151,12 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
decodeKVPairs(&reader, &results);
state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
if (g_network && g_simulator->isSimulated()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx.get(), TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache.get()->init());
}
wait(decodeKVPairs(&reader, &results, true, tenantCache, header));
} else {
throw restore_unsupported_file_version();
}
@ -1711,7 +1750,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
state bool done = false;
state int64_t nrKeys = 0;
state bool encryptionEnabled = false;
state Optional<bool> encryptionEnabled;
loop {
state RangeResultWithVersion values;
@ -1777,7 +1816,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
wait(taskBucket->keepRunning(tr, task) &&
storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) &&
storeOrThrow(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
store(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr)));
break;
@ -1790,9 +1829,10 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize));
outFile = f;
encryptionEnabled = encryptionEnabled && cx->clientInfo->get().isEncryptionEnabled;
const bool encrypted =
encryptionEnabled.present() && encryptionEnabled.get() && cx->clientInfo->get().isEncryptionEnabled;
// Initialize range file writer and write begin key
if (encryptionEnabled) {
if (encrypted) {
CODE_PROBE(true, "using encrypted snapshot file writer");
if (!tenantCache.isValid()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
@ -3398,6 +3438,8 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
state RestoreConfig restore(task);
restore.stateEnum().set(tr, ERestoreState::COMPLETED);
state bool unlockDB = wait(restore.unlockDBAfterRestore().getD(tr, Snapshot::False, true));
tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue);
// Clear the file map now since it could be huge.
restore.fileSet().clear(tr);
@ -3413,7 +3455,9 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
restore.clearApplyMutationsKeys(tr);
wait(taskBucket->finish(tr, task));
wait(unlockDatabase(tr, restore.getUid()));
if (unlockDB) {
wait(unlockDatabase(tr, restore.getUid()));
}
return Void();
}
@ -5172,6 +5216,7 @@ public:
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -5245,6 +5290,7 @@ public:
restore.onlyApplyMutationLogs().set(tr, onlyApplyMutationLogs);
restore.inconsistentSnapshotOnly().set(tr, inconsistentSnapshotOnly);
restore.beginVersion().set(tr, beginVersion);
restore.unlockDBAfterRestore().set(tr, unlockDB);
if (BUGGIFY && restoreRanges.size() == 1) {
restore.restoreRange().set(tr, restoreRanges[0]);
} else {
@ -5836,6 +5882,7 @@ public:
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -5892,6 +5939,7 @@ public:
addPrefix,
removePrefix,
lockDB,
unlockDB,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -6017,7 +6065,7 @@ public:
}
}
Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
state Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
if (fastRestore) {
TraceEvent("AtomicParallelRestoreStartRestore").log();
@ -6043,24 +6091,80 @@ public:
return -1;
} else {
TraceEvent("AS_StartRestore").log();
Version ver = wait(restore(backupAgent,
cx,
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
ranges,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid));
state Standalone<VectorRef<KeyRangeRef>> restoreRange;
state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
bool encryptionEnabled = cx->clientInfo->get().isEncryptionEnabled;
for (auto r : ranges) {
if (!encryptionEnabled || !r.intersects(getSystemBackupRanges())) {
restoreRange.push_back_deep(restoreRange.arena(), r);
} else {
KeyRangeRef normalKeyRange = r & normalKeys;
KeyRangeRef systemKeyRange = r & systemKeys;
if (!normalKeyRange.empty()) {
restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange);
}
if (!systemKeyRange.empty()) {
systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
}
}
}
if (!systemRestoreRange.empty()) {
// restore system keys
wait(success(restore(backupAgent,
cx,
cx,
"system_restore"_sr,
KeyRef(bc->getURL()),
bc->getProxy(),
systemRestoreRange,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
UnlockDB::False,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid)));
state Reference<ReadYourWritesTransaction> rywTransaction =
Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));
// clear old restore config associated with system keys
loop {
try {
rywTransaction->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
rywTransaction->setOption(FDBTransactionOptions::LOCK_AWARE);
state RestoreConfig oldRestore(randomUid);
oldRestore.clear(rywTransaction);
wait(rywTransaction->commit());
break;
} catch (Error& e) {
wait(rywTransaction->onError(e));
}
}
}
// restore user data
state Version ver = wait(restore(backupAgent,
cx,
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
restoreRange,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
UnlockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid));
return ver;
}
}
@ -6120,6 +6224,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -6137,6 +6242,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
unlockDB,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -6178,6 +6284,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
UnlockDB::True,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,

View File

@ -1,5 +1,5 @@
/*
* IdempotencyId.cpp
* IdempotencyId.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
@ -18,9 +18,11 @@
* limitations under the License.
*/
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // this has to be the last include
struct IdempotencyIdKVBuilderImpl {
Optional<Version> commitVersion;
@ -40,6 +42,7 @@ void IdempotencyIdKVBuilder::add(const IdempotencyIdRef& id, uint16_t batchIndex
ASSERT((batchIndex >> 8) == impl->batchIndexHighOrderByte.get());
} else {
impl->batchIndexHighOrderByte = batchIndex >> 8;
impl->value << int64_t(now());
}
StringRef s = id.asStringRefUnsafe();
impl->value << uint8_t(s.size());
@ -53,19 +56,17 @@ Optional<KeyValue> IdempotencyIdKVBuilder::buildAndClear() {
return {};
}
BinaryWriter key{ Unversioned() };
key.serializeBytes(idempotencyIdKeys.begin);
key << bigEndian64(impl->commitVersion.get());
key << impl->batchIndexHighOrderByte.get();
Value v = impl->value.toValue();
KeyRef key =
makeIdempotencySingleKeyRange(v.arena(), impl->commitVersion.get(), impl->batchIndexHighOrderByte.get()).begin;
impl->value = BinaryWriter(IncludeVersion());
impl->batchIndexHighOrderByte = Optional<uint8_t>();
Optional<KeyValue> result = KeyValue();
result.get().arena() = v.arena();
result.get().key = key.toValue(result.get().arena());
result.get().key = key;
result.get().value = v;
return result;
}
@ -86,6 +87,8 @@ Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const Idem
// Even if id is a substring of value, it may still not actually contain it.
BinaryReader reader(kv.value.begin(), kv.value.size(), IncludeVersion());
int64_t timestamp; // ignored
reader >> timestamp;
while (!reader.empty()) {
uint8_t length;
reader >> length;
@ -93,13 +96,9 @@ Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const Idem
uint8_t lowOrderBatchIndex;
reader >> lowOrderBatchIndex;
if (candidate == needle) {
BinaryReader reader(kv.key.begin(), kv.key.size(), Unversioned());
reader.readBytes(idempotencyIdKeys.begin.size());
Version commitVersion;
reader >> commitVersion;
commitVersion = bigEndian64(commitVersion);
uint8_t highOrderBatchIndex;
reader >> highOrderBatchIndex;
decodeIdempotencyKey(kv.key, commitVersion, highOrderBatchIndex);
return CommitResult{ commitVersion,
static_cast<uint16_t>((uint16_t(highOrderBatchIndex) << 8) |
uint16_t(lowOrderBatchIndex)) };
@ -172,4 +171,35 @@ TEST_CASE("/fdbclient/IdempotencyId/serialization") {
ASSERT(t == id);
}
return Void();
}
// Encodes the key for a single idempotency-id entry — subspace prefix, then the commit
// version in big-endian (so keys sort by version), then the high-order batch-index byte —
// and returns the single-key range [key, key + '\x00'). Both endpoints are allocated in
// one arena string; the begin key is the end key minus its trailing NUL.
KeyRangeRef makeIdempotencySingleKeyRange(Arena& arena, Version version, uint8_t highOrderBatchIndex) {
    static const auto totalSize =
        idempotencyIdKeys.begin.size() + sizeof(version) + sizeof(highOrderBatchIndex) + /*\x00*/ 1;
    StringRef endKey = makeString(totalSize, arena);
    auto* out = mutateString(endKey);
    memcpy(out, idempotencyIdKeys.begin.begin(), idempotencyIdKeys.begin.size());
    out += idempotencyIdKeys.begin.size();
    // Big-endian so lexicographic key order matches numeric version order.
    version = bigEndian64(version);
    memcpy(out, &version, sizeof(version));
    out += sizeof(version);
    *out++ = highOrderBatchIndex;
    *out++ = 0;
    ASSERT_EQ(out - endKey.begin(), totalSize);
    return KeyRangeRef(endKey.removeSuffix("\x00"_sr), endKey);
}
// Inverse of makeIdempotencySingleKeyRange's encoding: skips the idempotency subspace
// prefix, then extracts the big-endian commit version and the high-order batch-index
// byte from the key.
void decodeIdempotencyKey(KeyRef key, Version& commitVersion, uint8_t& highOrderBatchIndex) {
    BinaryReader rd(key, Unversioned());
    rd.readBytes(idempotencyIdKeys.begin.size()); // skip subspace prefix
    Version beVersion;
    rd >> beVersion;
    commitVersion = bigEndian64(beVersion); // stored big-endian for key ordering
    rd >> highOrderBatchIndex;
}

View File

@ -2639,7 +2639,8 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") {
ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource),
"",
"",
currentProtocolVersion());
currentProtocolVersion(),
false);
}
workers.push_back(data);

View File

@ -1888,6 +1888,9 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional
TraceEvent("UnknownDatabaseOption").detail("Option", option);
throw invalid_option();
}
if (itr->first == FDBDatabaseOptions::USE_CONFIG_DATABASE) {
dbState->isConfigDB = true;
}
int defaultFor = itr->second.defaultFor;
if (defaultFor >= 0) {
@ -1994,7 +1997,7 @@ ThreadFuture<ProtocolVersion> MultiVersionDatabase::getServerProtocol(Optional<P
MultiVersionDatabase::DatabaseState::DatabaseState(ClusterConnectionRecord const& connectionRecord,
Reference<IDatabase> versionMonitorDb)
: dbVar(new ThreadSafeAsyncVar<Reference<IDatabase>>(Reference<IDatabase>(nullptr))),
connectionRecord(connectionRecord), versionMonitorDb(versionMonitorDb), closed(false) {}
connectionRecord(connectionRecord), versionMonitorDb(versionMonitorDb), closed(false), isConfigDB(false) {}
// Adds a client (local or externally loaded) that can be used to connect to the cluster
void MultiVersionDatabase::DatabaseState::addClient(Reference<ClientInfo> client) {
@ -2192,8 +2195,12 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> ne
.detail("ConnectionRecord", connectionRecord);
}
}
// Verify the database has the necessary functionality to update the shared
// state. Avoid updating the shared state if the database is a
// configuration database, because a configuration database does not have
// access to typical system keys and does not need to be updated.
if (db.isValid() && dbProtocolVersion.present() &&
MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap()) {
MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap() && !isConfigDB) {
Future<std::string> updateResult =
MultiVersionApi::api->updateClusterSharedStateMap(connectionRecord, dbProtocolVersion.get(), db);
sharedStateUpdater = map(errorOr(updateResult), [this](ErrorOr<std::string> result) {

View File

@ -1479,16 +1479,6 @@ Future<RangeResult> HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction*
return healthMetricsGetRangeActor(ryw, kr);
}
KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) {
if (prefix.empty()) {
return range;
} else {
KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin;
KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end;
return KeyRangeRef(begin, end);
}
}
ACTOR Future<UID> getClusterId(Database db) {
while (!db->clientInfo->get().clusterId.isValid()) {
wait(db->clientInfo->onChange());
@ -1925,7 +1915,8 @@ Optional<KeyRangeLocationInfo> DatabaseContext::getCachedLocation(const Optional
auto range =
isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey);
if (range->value()) {
return KeyRangeLocationInfo(tenantEntry, toRelativeRange(range->range(), tenantEntry.prefix), range->value());
return KeyRangeLocationInfo(
tenantEntry, toPrefixRelativeRange(range->range(), tenantEntry.prefix), range->value());
}
return Optional<KeyRangeLocationInfo>();
@ -1962,7 +1953,8 @@ bool DatabaseContext::getCachedLocations(const Optional<TenantNameRef>& tenantNa
result.clear();
return false;
}
result.emplace_back(tenantEntry, toRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value());
result.emplace_back(
tenantEntry, toPrefixRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value());
if (result.size() == limit || begin == end) {
break;
}
@ -2978,7 +2970,7 @@ ACTOR Future<KeyRangeLocationInfo> getKeyLocation_internal(Database cx,
return KeyRangeLocationInfo(
rep.tenantEntry,
KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
locationInfo);
}
}
@ -3123,7 +3115,7 @@ ACTOR Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations_internal(
// efficient to save the map pairs and insert them all at once.
results.emplace_back(
rep.tenantEntry,
(toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
(toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
cx->setCachedLocation(
tenant.name, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second));
wait(yield());
@ -4025,6 +4017,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
req.version = version;
req.begin = firstGreaterOrEqual(range.begin);
req.end = firstGreaterOrEqual(range.end);
setMatchIndex<GetKeyValuesFamilyRequest>(req, matchIndex);
req.spanContext = span.context;
trState->cx->getLatestCommitVersions(
@ -6158,6 +6151,7 @@ ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<Tran
IdempotencyIdRef idempotencyId) {
state Transaction tr(trState->cx);
state int retries = 0;
state Version expiredVersion;
state Span span("NAPI:determineCommitStatus"_loc, trState->spanContext);
tr.span.setParent(span.context);
loop {
@ -6167,11 +6161,19 @@ ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<Tran
tr.trState->authToken = trState->authToken;
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
KeyBackedObjectProperty<IdempotencyIdsExpiredVersion, _Unversioned> expiredKey(idempotencyIdsExpiredVersion,
Unversioned());
IdempotencyIdsExpiredVersion expiredVal = wait(expiredKey.getD(&tr));
expiredVersion = expiredVal.expired;
if (expiredVersion >= minPossibleCommitVersion) {
throw commit_unknown_result_fatal();
}
Version rv = wait(tr.getReadVersion());
TraceEvent("DetermineCommitStatusAttempt")
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe())
.detail("Retries", retries)
.detail("ReadVersion", rv)
.detail("ExpiredVersion", expiredVersion)
.detail("MinPossibleCommitVersion", minPossibleCommitVersion)
.detail("MaxPossibleCommitVersion", maxPossibleCommitVersion);
KeyRange possibleRange =
@ -6415,6 +6417,12 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
req.debugID = commitID;
state Future<CommitID> reply;
// Only gets filled in in the happy path where we don't have to commit on the first proxy or use provisional
// proxies
state int alternativeChosen = -1;
// Only valid if alternativeChosen >= 0
state Reference<CommitProxyInfo> proxiesUsed;
if (trState->options.commitOnFirstProxy) {
if (trState->cx->clientInfo->get().firstCommitProxy.present()) {
reply = throwErrorOr(brokenPromiseToMaybeDelivered(
@ -6425,11 +6433,13 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
: Never();
}
} else {
reply = basicLoadBalance(trState->cx->getCommitProxies(trState->useProvisionalProxies),
proxiesUsed = trState->cx->getCommitProxies(trState->useProvisionalProxies);
reply = basicLoadBalance(proxiesUsed,
&CommitProxyInterface::commit,
req,
TaskPriority::DefaultPromiseEndpoint,
AtMostOnce::True);
AtMostOnce::True,
&alternativeChosen);
}
state double grvTime = now();
choose {
@ -6479,6 +6489,12 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
ci.version,
req,
trState->tenant()));
if (trState->automaticIdempotency && alternativeChosen >= 0) {
// Automatic idempotency means we're responsible for best effort idempotency id clean up
proxiesUsed->getInterface(alternativeChosen)
.expireIdempotencyId.send(ExpireIdempotencyIdRequest{
ci.version, uint8_t(ci.txnBatchId >> 8), trState->getTenantInfo() });
}
return Void();
} else {
// clear the RYW transaction which contains previous conflicting keys
@ -6566,7 +6582,7 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
e.code() != error_code_grv_proxy_memory_limit_exceeded &&
e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled &&
e.code() != error_code_process_behind && e.code() != error_code_future_version &&
e.code() != error_code_tenant_not_found) {
e.code() != error_code_tenant_not_found && e.code() != error_code_proxy_tag_throttled) {
TraceEvent(SevError, "TryCommitError").error(e);
}
if (trState->trLogInfo)
@ -6964,11 +6980,16 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
throw e;
}
tr.idempotencyId = IdempotencyIdRef(tr.arena, IdempotencyIdRef(value.get()));
trState->automaticIdempotency = false;
break;
case FDBTransactionOptions::AUTOMATIC_IDEMPOTENCY:
validateOptionValueNotPresent(value);
tr.idempotencyId = IdempotencyIdRef(
tr.arena, IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
if (!tr.idempotencyId.valid()) {
tr.idempotencyId = IdempotencyIdRef(
tr.arena,
IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
}
trState->automaticIdempotency = true;
break;
default:
@ -7007,6 +7028,8 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
&GrvProxyInterface::getConsistentReadVersion,
req,
cx->taskID))) {
CODE_PROBE(v.proxyTagThrottledDuration > 0.0,
"getConsistentReadVersion received GetReadVersionReply delayed by proxy tag throttling");
if (tags.size() != 0) {
auto& priorityThrottledTags = cx->throttledTags[priority];
for (auto& tag : tags) {
@ -7041,7 +7064,7 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
}
} catch (Error& e) {
if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled &&
e.code() != error_code_grv_proxy_memory_limit_exceeded)
e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_proxy_tag_throttled)
TraceEvent(SevError, "GetConsistentReadVersionError").error(e);
if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) {
wait(delayJittered(5.0));
@ -7492,7 +7515,7 @@ Future<Void> Transaction::onError(Error const& e) {
e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded ||
e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind ||
e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled ||
e.code() == error_code_blob_granule_request_failed) {
e.code() == error_code_blob_granule_request_failed || e.code() == error_code_proxy_tag_throttled) {
if (e.code() == error_code_not_committed)
++trState->cx->transactionsNotCommitted;
else if (e.code() == error_code_commit_unknown_result)
@ -7732,6 +7755,35 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
}
}
// Query storage servers for aggregated StorageMetrics over `keys`, using shard locations
// the caller has already resolved.
//
// `min`/`max` are forwarded in the WaitMetricsRequest and bound the metrics band the
// storage servers wait on before replying (see WaitMetricsRequest for exact semantics);
// `permittedError` is the slack allowed when summing across multiple shards.
//
// Returns the metrics on success, or an empty Optional when the failure indicates a stale
// shard map (wrong_shard_server / all_alternatives_failed) — the caller is expected to
// invalidate its location cache and retry. All other errors are rethrown.
ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
                                                                      KeyRange keys,
                                                                      std::vector<KeyRangeLocationInfo> locations,
                                                                      StorageMetrics min,
                                                                      StorageMetrics max,
                                                                      StorageMetrics permittedError) {
	try {
		Future<StorageMetrics> fx;
		if (locations.size() > 1) {
			// Range spans multiple shards: fan out one request per shard and combine.
			fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
		} else {
			// Single shard: load-balance the request across that shard's replicas.
			WaitMetricsRequest req(tenantInfo, keys, min, max);
			fx = loadBalance(locations[0].locations->locations(),
			                 &StorageServerInterface::waitMetrics,
			                 req,
			                 TaskPriority::DataDistribution);
		}
		StorageMetrics x = wait(fx);
		return x;
	} catch (Error& e) {
		// Every failure is noted at debug level; the same event is escalated to SevError
		// (and rethrown) only when it is not a retryable stale-location error.
		TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
			TraceEvent(SevError, "WaitStorageMetricsError").error(e);
			throw;
		}
	}
	// Stale-location error: signal "retry with fresh locations" to the caller.
	return Optional<StorageMetrics>();
}
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
Database cx,
KeyRange keys,
@ -7761,38 +7813,26 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
}
// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
// solution to this.
if (locations.size() < shardLimit) {
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
TaskPriority::DataDistribution);
}
StorageMetrics x = wait(fx);
return std::make_pair(x, -1);
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
} else {
// solution to this. How could this happen?
if (locations.size() >= shardLimit) {
TraceEvent(SevWarn, "WaitStorageMetricsPenalty")
.detail("Keys", keys)
.detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT)
.detail("Limit", shardLimit)
.detail("LocationSize", locations.size())
.detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY);
wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
// make sure that the next getKeyRangeLocations() call will actually re-fetch the range
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
continue;
}
Optional<StorageMetrics> res =
wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
if (res.present()) {
return std::make_pair(res, -1);
}
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
@ -8653,6 +8693,56 @@ Future<Void> DatabaseContext::splitStorageMetricsStream(const PromiseStream<Key>
resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
}
// Ask each shard in `locations` (in key order) for split points so that `keys` is divided
// into chunks bounded by `limit` (with `estimated` total metrics and an optional
// `minSplitBytes` floor). The returned vector starts with keys.begin and, when the last
// shard covers keys.end, ends with keys.end.
//
// Returns an empty Optional on stale-location errors (wrong_shard_server /
// all_alternatives_failed), in which case the caller should invalidate its location cache
// and retry; other errors are traced at SevError and rethrown.
ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
    std::vector<KeyRangeLocationInfo> locations,
    KeyRange keys,
    StorageMetrics limit,
    StorageMetrics estimated,
    Optional<int> minSplitBytes) {
	// `used` accumulates the metrics consumed by earlier shards so each SplitMetricsRequest
	// knows how much of the current chunk's budget is already spent.
	state StorageMetrics used;
	state Standalone<VectorRef<KeyRef>> results;
	results.push_back_deep(results.arena(), keys.begin);
	//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
	try {
		state int i = 0;
		for (; i < locations.size(); i++) {
			// The final shard is flagged (isLastShard == true) so the server can treat the
			// trailing remainder differently.
			SplitMetricsRequest req(
			    locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
			SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
			                                         &StorageServerInterface::splitMetrics,
			                                         req,
			                                         TaskPriority::DataDistribution));
			if (res.splits.size() && res.splits[0] <= results.back()) { // split points are out of order, possibly
				// because of moving data, throw error to retry
				ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
				throw all_alternatives_failed();
			}
			if (res.splits.size()) {
				// Adopt the reply's split points; dependsOn keeps their memory alive in our arena.
				results.append(results.arena(), res.splits.begin(), res.splits.size());
				results.arena().dependsOn(res.splits.arena());
			}
			used = res.used;

			//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
		}

		// If the tail chunk is disproportionately small (an "unfair" split), drop the last
		// split point so the tail merges into the previous chunk.
		if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) {
			results.resize(results.arena(), results.size() - 1);
		}

		// Close the result with keys.end when the shard list actually reaches it.
		if (keys.end <= locations.back().range.end) {
			results.push_back_deep(results.arena(), keys.end);
		}
		return results;
	} catch (Error& e) {
		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
			TraceEvent(SevError, "SplitStorageMetricsError").error(e);
			throw;
		}
	}
	// Stale-location error: signal "retry with fresh locations" to the caller.
	return Optional<Standalone<VectorRef<KeyRef>>>();
}
ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
KeyRange keys,
StorageMetrics limit,
@ -8671,61 +8761,24 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
state StorageMetrics used;
state Standalone<VectorRef<KeyRef>> results;
// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
// solution to this.
if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(Key(), keys);
} else {
results.push_back_deep(results.arena(), keys.begin);
try {
//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
state int i = 0;
for (; i < locations.size(); i++) {
SplitMetricsRequest req(
locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
TaskPriority::DataDistribution));
if (res.splits.size() &&
res.splits[0] <= results.back()) { // split points are out of order, possibly because of
// moving data, throw error to retry
ASSERT_WE_THINK(
false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}
if (res.splits.size()) {
results.append(results.arena(), res.splits.begin(), res.splits.size());
results.arena().dependsOn(res.splits.arena());
}
used = res.used;
//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
}
if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) &&
results.size() > 1) {
results.resize(results.arena(), results.size() - 1);
}
if (keys.end <= locations.back().range.end) {
results.push_back_deep(results.arena(), keys.end);
}
return results;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "SplitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(Key(), keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
continue;
}
Optional<Standalone<VectorRef<KeyRef>>> results =
wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));
if (results.present()) {
return results.get();
}
cx->invalidateCache(Key(), keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
@ -9312,7 +9365,7 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
mismatchEvent.detail("EndKey", request.range.end);
mismatchEvent.detail("CanReadPopped", request.canReadPopped);
mismatchEvent.detail("PopVersion", popVersion);
mismatchEvent.detail("DebugUID", request.debugUID);
mismatchEvent.detail("DebugUID", request.id);
// mismatch info
mismatchEvent.detail("MatchesFound", matchesFound);
@ -9338,7 +9391,7 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
"TSSMismatchChangeFeedStream");
summaryEvent.detail("TSSID", tssData.tssId)
.detail("MismatchId", mismatchUID)
.detail("FeedDebugUID", request.debugUID);
.detail("FeedDebugUID", request.id);
}
}
}
@ -9863,7 +9916,8 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
Version* begin,
Version end,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state std::vector<Future<Void>> fetchers(interfs.size());
state std::vector<Future<Void>> onErrors(interfs.size());
state std::vector<MutationAndVersionStream> streams(interfs.size());
@ -9891,10 +9945,11 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
if (replyBufferSize != -1 && req.replyBufferSize < CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES) {
req.replyBufferSize = CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES;
}
req.debugUID = deterministicRandom()->randomUniqueID();
debugUIDs.push_back(req.debugUID);
mergeCursorUID =
UID(mergeCursorUID.first() ^ req.debugUID.first(), mergeCursorUID.second() ^ req.debugUID.second());
req.options = readOptions;
req.id = deterministicRandom()->randomUniqueID();
debugUIDs.push_back(req.id);
mergeCursorUID = UID(mergeCursorUID.first() ^ req.id.first(), mergeCursorUID.second() ^ req.id.second());
results->streams.push_back(interfs[i].first.changeFeedStream.getReplyStream(req));
maybeDuplicateTSSChangeFeedStream(req,
@ -10097,7 +10152,8 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
Version* begin,
Version end,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state Database cx(db);
state ChangeFeedStreamRequest req;
state Optional<ChangeFeedTSSValidationData> tssData;
@ -10107,10 +10163,11 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
req.range = range;
req.canReadPopped = canReadPopped;
req.replyBufferSize = replyBufferSize;
req.debugUID = deterministicRandom()->randomUniqueID();
req.options = readOptions;
req.id = deterministicRandom()->randomUniqueID();
if (DEBUG_CF_CLIENT_TRACE) {
TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.debugUID)
TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.id)
.detail("FeedID", rangeID)
.detail("Range", range)
.detail("Begin", *begin)
@ -10150,7 +10207,8 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
Version end,
KeyRange range,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state Database cx(db);
state Span span("NAPI:GetChangeFeedStream"_loc);
db->usedAnyChangeFeeds = true;
@ -10240,14 +10298,22 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
}
CODE_PROBE(true, "Change feed merge cursor");
// TODO (jslocum): validate connectionFileChanged behavior
wait(
mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
cx->connectionFileChanged());
wait(mergeChangeFeedStream(
db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped, readOptions) ||
cx->connectionFileChanged());
} else {
CODE_PROBE(true, "Change feed single cursor");
StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]);
wait(singleChangeFeedStream(
db, interf, range, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
wait(singleChangeFeedStream(db,
interf,
range,
results,
rangeID,
&begin,
end,
replyBufferSize,
canReadPopped,
readOptions) ||
cx->connectionFileChanged());
}
} catch (Error& e) {
@ -10314,9 +10380,17 @@ Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> resu
Version end,
KeyRange range,
int replyBufferSize,
bool canReadPopped) {
return getChangeFeedStreamActor(
Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped);
bool canReadPopped,
ReadOptions readOptions) {
return getChangeFeedStreamActor(Reference<DatabaseContext>::addRef(this),
results,
rangeID,
begin,
end,
range,
replyBufferSize,
canReadPopped,
readOptions);
}
Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const {
@ -10548,6 +10622,76 @@ Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
}
// BlobGranule API.
// Collect the key ranges currently marked active ("blobbified") in the blob range map that
// intersect `range`, returning at most `batchLimit` ranges.
//
// The map is read via krmGetRangesUnaligned over blobRangeKeys: each returned row is a
// boundary whose value describes the state of the range beginning at that key, so row i and
// row i+1 together delimit one range. The scan proceeds in batches, resuming from the last
// boundary, and retries transaction errors through tr->onError().
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) {
	state Standalone<VectorRef<KeyRangeRef>> blobRanges;
	state Key beginKey = range.begin;
	loop {
		try {
			// Blob range metadata lives in the system keyspace; re-set after onError resets.
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);

			// 2 * batchLimit + 2 rows suffice for batchLimit ranges plus bracketing boundaries.
			state RangeResult results = wait(
			    krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));

			blobRanges.arena().dependsOn(results.arena());
			for (int i = 0; i < results.size() - 1; i++) {
				// Value blobRangeActive at row i means [results[i].key, results[i+1].key) is blobbified.
				if (results[i].value == blobRangeActive) {
					blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
				}
				if (blobRanges.size() == batchLimit) {
					return blobRanges;
				}
			}

			if (!results.more) {
				return blobRanges;
			}
			// More data: resume the next batch from the last boundary seen.
			beginKey = results.back().key;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}
// Resolve the blobbified (active blob) ranges intersecting `range`, optionally scoped to a
// tenant. When `tenantName` is present, the tenant's prefix is applied to the query range
// before the lookup and stripped from every returned range; any blob range not fully
// contained in the tenant's keyspace is dropped with a throttled trace event.
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobbifiedRanges(Transaction* tr,
                                                                     KeyRange range,
                                                                     int rangeLimit,
                                                                     Optional<TenantName> tenantName) {
	state TenantMapEntry tenantEntry;

	// Resolve the tenant entry (retrying via onError) and rebase the query range onto the
	// tenant's prefix before consulting the blob range map.
	loop {
		try {
			if (tenantName.present()) {
				wait(store(tenantEntry, blobGranuleGetTenantEntry(tr, range.begin, tenantName)));
				range = range.withPrefix(tenantEntry.prefix);
			}
			break;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}

	state Standalone<VectorRef<KeyRangeRef>> rawRanges = wait(getBlobRanges(tr, range, rangeLimit));
	if (!tenantName.present()) {
		return rawRanges;
	}

	// Tenant-scoped request: report ranges relative to the tenant by removing its prefix.
	state Standalone<VectorRef<KeyRangeRef>> tenantScopedRanges;
	for (auto& br : rawRanges) {
		bool insideTenant = br.begin.startsWith(tenantEntry.prefix) && br.end.startsWith(tenantEntry.prefix);
		if (insideTenant) {
			tenantScopedRanges.push_back_deep(tenantScopedRanges.arena(), br.removePrefix(tenantEntry.prefix));
		} else {
			// A blob range straddling the tenant boundary is unexpected; skip it but record it.
			TraceEvent("ListBlobbifiedRangeSpansTenants")
			    .suppressFor(/*seconds=*/5)
			    .detail("Tenant", tenantName.get())
			    .detail("Range", br);
		}
	}
	return tenantScopedRanges;
}
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
KeyRange range,
Version purgeVersion,
@ -10590,10 +10734,13 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
}
// must be aligned to blob range(s)
state Future<Optional<Value>> beginPresent = tr.get(purgeRange.begin.withPrefix(blobRangeKeys.begin));
state Future<Optional<Value>> endPresent = tr.get(purgeRange.end.withPrefix(blobRangeKeys.begin));
wait(success(beginPresent) && success(endPresent));
if (!beginPresent.get().present() || !endPresent.get().present()) {
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2, {});
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2, {});
wait(success(blobbifiedBegin) && success(blobbifiedEnd));
if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
(!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) {
TraceEvent("UnalignedPurge")
.detail("Range", range)
.detail("Version", purgeVersion)
@ -10670,39 +10817,6 @@ Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}
// ReadYourWritesTransaction overload: collect the key ranges marked active ("blobbified")
// in the blob range map that intersect `range`, returning at most `batchLimit` ranges.
//
// Reads blobRangeKeys via krmGetRangesUnaligned; each returned row is a boundary whose
// value describes the range starting at that key, so rows i and i+1 delimit one range.
// Scans in batches, resuming from the last boundary, retrying errors through tr->onError().
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Reference<ReadYourWritesTransaction> tr,
                                                               KeyRange range,
                                                               int batchLimit) {
	state Standalone<VectorRef<KeyRangeRef>> blobRanges;
	state Key beginKey = range.begin;
	loop {
		try {
			// Blob range metadata lives in the system keyspace; re-set after onError resets.
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);

			// 2 * batchLimit + 2 rows cover batchLimit ranges plus bracketing boundaries.
			state RangeResult results = wait(
			    krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));

			blobRanges.arena().dependsOn(results.arena());
			for (int i = 0; i < results.size() - 1; i++) {
				// Value blobRangeActive at row i means [results[i].key, results[i+1].key) is blobbified.
				if (results[i].value == blobRangeActive) {
					blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
				}
				if (blobRanges.size() == batchLimit) {
					return blobRanges;
				}
			}

			if (!results.more) {
				return blobRanges;
			}
			// More data: resume the next batch from the last boundary seen.
			beginKey = results.back().key;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}
ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
KeyRange range,
bool active,
@ -10724,7 +10838,7 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
range = range.withPrefix(tenantEntry.prefix);
}
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(tr, range, 1));
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1));
if (active) {
// Idempotent request.
@ -10772,47 +10886,19 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Refer
KeyRange range,
int rangeLimit,
Optional<TenantName> tenantName) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state TenantMapEntry tme;
state Transaction tr(db);
loop {
try {
if (tenantName.present()) {
wait(store(tme, blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
Standalone<VectorRef<KeyRangeRef>> blobbifiedRanges = wait(getBlobbifiedRanges(&tr, range, rangeLimit, tenantName));
state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
if (!tenantName.present()) {
return blobRanges;
}
// Strip tenant prefix out.
state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
for (auto& blobRange : blobRanges) {
// Filter out blob ranges that span tenants for some reason.
if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
TraceEvent("ListBlobbifiedRangeSpansTenants")
.suppressFor(/*seconds=*/5)
.detail("Tenant", tenantName.get())
.detail("Range", blobRange);
continue;
}
tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
}
return tenantBlobRanges;
return blobbifiedRanges;
}
Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,
int rowLimit,
int rangeLimit,
Optional<TenantName> tenantName) {
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rowLimit, tenantName);
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName);
}
int64_t getMaxKeySize(KeyRef const& key) {

View File

@ -422,10 +422,11 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experminatal purpose, never enable this in production.
// If enabled, all the committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ENABLE_CLEAR_RANGE_EAGER_READS knob.
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These knobs have contrary functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
init( ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
// Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable
@ -727,8 +728,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL, 30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0;
init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false;
init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10);
init( GLOBAL_TAG_THROTTLING, false );
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false );
init( GLOBAL_TAG_THROTTLING, false ); if(isSimulated) GLOBAL_TAG_THROTTLING = deterministicRandom()->coinflip();
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, GLOBAL_TAG_THROTTLING );
init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 );
init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 );
init( GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED, 10 );
@ -761,7 +762,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_KEYS_LOWER_PRIORITY, 0 );
init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 );
init( SERVE_AUDIT_STORAGE_PARALLELISM, 2 );
init( CHANGE_FEED_DISK_READS_PARALLELISM, 1000 ); if( randomize && BUGGIFY ) CHANGE_FEED_DISK_READS_PARALLELISM = 20;
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS );
init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000;
@ -800,6 +800,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );
init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 );
init( STORAGE_SERVER_READ_CONCURRENCY, 70 );
// Priorities which each ReadType maps to, in enumeration order
init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" );
init( STORAGESERVER_READ_PRIORITIES, "48,32,8" );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -911,7 +915,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_DEFAULT_EXTENT_SIZE, 32 * 1024 * 1024 );
init( REDWOOD_DEFAULT_EXTENT_READ_SIZE, 1024 * 1024 );
init( REDWOOD_EXTENT_CONCURRENT_READS, 4 );
init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
init( REDWOOD_KVSTORE_RANGE_PREFETCH, true );
init( REDWOOD_PAGE_REBUILD_MAX_SLACK, 0.33 );
init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 );
@ -924,6 +927,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement
@ -966,6 +970,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_CONSISTENCY_CHECK_ENABLED, true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false;
init( BG_CONSISTENCY_CHECK_TARGET_SPEED_KB, 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10);
init( BG_KEY_TUPLE_TRUNCATE_OFFSET, 0 );
init( BG_ENABLE_READ_DRIVEN_COMPACTION, true ); if (randomize && BUGGIFY) BG_ENABLE_READ_DRIVEN_COMPACTION = false;
init( BG_RDC_BYTES_FACTOR, 2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_RDC_READ_FACTOR, 3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
@ -974,6 +981,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10);
init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100);
init( BLOB_WORKER_RDC_PARALLELISM, 2 ); if( randomize && BUGGIFY ) BLOB_WORKER_RDC_PARALLELISM = deterministicRandom()->randomInt(1, 6);
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0;
init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 );
@ -996,8 +1005,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Blob Metadata
init( BLOB_METADATA_CACHE_TTL, isSimulated ? 120 : 24 * 60 * 60 );
if ( randomize && BUGGIFY) { BLOB_METADATA_CACHE_TTL = deterministicRandom()->randomInt(50, 100); }
init( BLOB_METADATA_REFRESH_INTERVAL, isSimulated ? 60 : 60 * 60 );
if ( randomize && BUGGIFY) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
// HTTP KMS Connector
init( REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE, "file");
@ -1018,6 +1025,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// NOTE: 'token-name" can NOT contain '#' character
init( REST_KMS_CONNECTOR_VALIDATION_TOKEN_DETAILS, "");
// Drop in-memory state associated with an idempotency id after this many seconds. Once dropped, this id cannot be
// expired proactively, but will eventually get cleaned up by the idempotency id cleaner.
init( IDEMPOTENCY_ID_IN_MEMORY_LIFETIME, 10);
// clang-format on
if (clientKnobs) {

View File

@ -284,8 +284,6 @@ const KeyRangeRef readConflictRangeKeysRange =
const KeyRangeRef writeConflictRangeKeysRange = KeyRangeRef("\xff\xff/transaction/write_conflict_range/"_sr,
"\xff\xff/transaction/write_conflict_range/\xff\xff"_sr);
const KeyRef clusterIdKey = "\xff/clusterId"_sr;
const KeyRangeRef auditRange = KeyRangeRef("\xff/audit/"_sr, "\xff/audit0"_sr);
const KeyRef auditPrefix = auditRange.begin;
@ -1074,6 +1072,11 @@ const KeyRangeRef timeKeeperPrefixRange("\xff\x02/timeKeeper/map/"_sr, "\xff\x02
const KeyRef timeKeeperVersionKey = "\xff\x02/timeKeeper/version"_sr;
const KeyRef timeKeeperDisableKey = "\xff\x02/timeKeeper/disable"_sr;
// Durable cluster ID key. Added "Key" to the end to differentiate from the key
// "\xff/clusterId" which was stored in the txnStateStore in FDB 7.1, whereas
// this key is stored in the database in 7.2+.
const KeyRef clusterIdKey = "\xff/clusterIdKey"_sr;
// Backup Log Mutation constant variables
const KeyRef backupEnabledKey = "\xff/backupEnabled"_sr;
const KeyRangeRef backupLogKeys("\xff\x02/blog/"_sr, "\xff\x02/blog0"_sr);
@ -1810,4 +1813,4 @@ TEST_CASE("noSim/SystemData/compat/KeyServers") {
printf("ssi serdes test complete\n");
return Void();
}
}

View File

@ -579,8 +579,8 @@ public:
int maxConcurrentTasks) {
state Reference<AsyncVar<bool>> paused = makeReference<AsyncVar<bool>>(true);
state Future<Void> watchPausedFuture = watchPaused(cx, taskBucket, paused);
taskBucket->metricLogger = traceCounters(
"TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY, &taskBucket->cc);
taskBucket->metricLogger = taskBucket->cc.traceCounters(
"TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY);
loop {
while (paused->get()) {
wait(paused->onChange() || watchPausedFuture);

View File

@ -196,6 +196,7 @@ public:
Key addPrefix = Key(),
Key removePrefix = Key(),
LockDB = LockDB::True,
UnlockDB = UnlockDB::True,
OnlyApplyMutationLogs = OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly = InconsistentSnapshotOnly::False,
Version beginVersion = ::invalidVersion,

View File

@ -91,4 +91,8 @@ struct BlobMetadataDetailsRef {
}
};
Standalone<BlobMetadataDetailsRef> createRandomTestBlobMetadata(const std::string& baseUrl,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName);
#endif

View File

@ -45,6 +45,7 @@ struct BlobWorkerStats {
Counter compressionBytesFinal;
Counter fullRejections;
Counter forceFlushCleanups;
Counter readDrivenCompactions;
int numRangesAssigned;
int mutationBytesBuffered;
@ -83,10 +84,11 @@ struct BlobWorkerStats {
readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc),
forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0),
activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0),
notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0),
initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) {
forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc),
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0),
minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0),
estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock),
deltaWritesLock(deltaWritesLock) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });
@ -103,8 +105,8 @@ struct BlobWorkerStats {
specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); });
specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); });
logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics");
logger = cc.traceCounters("BlobWorkerMetrics", id, interval, "BlobWorkerMetrics");
}
};
#endif
#endif

View File

@ -22,7 +22,7 @@
#define FDBCLIENT_BUILD_IDEMPOTENCY_ID_MUTATIONS_H
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#pragma once

View File

@ -264,6 +264,8 @@ public:
int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations
// Cost multiplier for writes (because write operations are more expensive than reads):
double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;
double PROXY_MAX_TAG_THROTTLE_DURATION; // Maximum duration that a transaction can be tag throttled by proxy before
// being rejected
// busyness reporting
double BUSYNESS_SPIKE_START_THRESHOLD;
@ -272,6 +274,7 @@ public:
// Blob Granules
int BG_MAX_GRANULE_PARALLELISM;
int BG_TOO_MANY_GRANULES;
int64_t BLOB_METADATA_REFRESH_INTERVAL;
// The coordinator key/value in storage server might be inconsistent to the value stored in the cluster file.
// This might happen when a recovery is happening together with a cluster controller coordinator key change.

View File

@ -30,7 +30,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GlobalConfig.h"
#include "fdbclient/GrvProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/VersionVector.h"
@ -61,6 +61,7 @@ struct CommitProxyInterface {
RequestStream<struct ProxySnapRequest> proxySnapReq;
RequestStream<struct ExclusionSafetyCheckRequest> exclusionSafetyCheckReq;
RequestStream<struct GetDDMetricsRequest> getDDMetrics;
PublicRequestStream<struct ExpireIdempotencyIdRequest> expireIdempotencyId;
UID id() const { return commit.getEndpoint().token; }
std::string toString() const { return id().shortString(); }
@ -87,6 +88,8 @@ struct CommitProxyInterface {
exclusionSafetyCheckReq =
RequestStream<struct ExclusionSafetyCheckRequest>(commit.getEndpoint().getAdjustedEndpoint(8));
getDDMetrics = RequestStream<struct GetDDMetricsRequest>(commit.getEndpoint().getAdjustedEndpoint(9));
expireIdempotencyId =
PublicRequestStream<struct ExpireIdempotencyIdRequest>(commit.getEndpoint().getAdjustedEndpoint(10));
}
}
@ -103,6 +106,7 @@ struct CommitProxyInterface {
streams.push_back(proxySnapReq.getReceiver());
streams.push_back(exclusionSafetyCheckReq.getReceiver());
streams.push_back(getDDMetrics.getReceiver());
streams.push_back(expireIdempotencyId.getReceiver());
FlowTransport::transport().addEndpoints(streams);
}
};
@ -151,6 +155,24 @@ struct ClientDBInfo {
}
};
// Request sent to expire (clean up) the idempotency id recorded for a committed batch.
// Identified by the commit version plus the high byte of the batch index within that version.
struct ExpireIdempotencyIdRequest {
constexpr static FileIdentifier file_identifier = 1900933;
// Commit version whose idempotency id entry should be expired; invalidVersion until set.
Version commitVersion = invalidVersion;
// High-order byte of the batch index at commitVersion (ids are grouped by this byte).
uint8_t batchIndexHighByte = 0;
// Tenant scoping for the request; used by verify() for authorization.
TenantInfo tenant;
ExpireIdempotencyIdRequest() {}
ExpireIdempotencyIdRequest(Version commitVersion, uint8_t batchIndexHighByte, TenantInfo tenant)
: commitVersion(commitVersion), batchIndexHighByte(batchIndexHighByte), tenant(tenant) {}
// Authorization check invoked by the request stream: only authorized tenants may expire ids.
bool verify() const { return tenant.isAuthorized(); }
template <class Ar>
void serialize(Ar& ar) {
// NOTE: field order is part of the wire format — do not reorder.
serializer(ar, commitVersion, batchIndexHighByte, tenant);
}
};
struct CommitID {
constexpr static FileIdentifier file_identifier = 14254927;
Version version; // returns invalidVersion if transaction conflicts

View File

@ -382,7 +382,8 @@ public:
Version end = std::numeric_limits<Version>::max(),
KeyRange range = allKeys,
int replyBufferSize = -1,
bool canReadPopped = true);
bool canReadPopped = true,
ReadOptions readOptions = { ReadType::NORMAL, CacheResult::False });
Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);

View File

@ -590,6 +590,8 @@ inline KeyRange prefixRange(KeyRef prefix) {
// The returned reference is valid as long as keys is valid.
KeyRef keyBetween(const KeyRangeRef& keys);
KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix);
struct KeySelectorRef {
private:
KeyRef key; // Find the last item less than key
@ -1633,13 +1635,7 @@ struct StorageWiggleValue {
}
};
enum class ReadType {
EAGER,
FETCH,
LOW,
NORMAL,
HIGH,
};
enum class ReadType { EAGER = 0, FETCH = 1, LOW = 2, NORMAL = 3, HIGH = 4, MIN = EAGER, MAX = HIGH };
FDB_DECLARE_BOOLEAN_PARAM(CacheResult);
@ -1655,14 +1651,14 @@ struct ReadOptions {
Optional<UID> debugID;
Optional<Version> consistencyCheckStartVersion;
ReadOptions() : type(ReadType::NORMAL), cacheResult(CacheResult::True){};
ReadOptions(Optional<UID> debugID,
ReadOptions(Optional<UID> debugID = Optional<UID>(),
ReadType type = ReadType::NORMAL,
CacheResult cache = CacheResult::False,
CacheResult cache = CacheResult::True,
Optional<Version> version = Optional<Version>())
: type(type), cacheResult(cache), debugID(debugID), consistencyCheckStartVersion(version){};
ReadOptions(ReadType type, CacheResult cache = CacheResult::True) : ReadOptions({}, type, cache) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, type, cacheResult, debugID, consistencyCheckStartVersion);

View File

@ -1,5 +1,5 @@
/*
* IdempotencyId.h
* IdempotencyId.actor.h
*
* This source file is part of the FoundationDB open source project
*
@ -18,8 +18,13 @@
* limitations under the License.
*/
#ifndef FDBCLIENT_IDEMPOTENCYID_H
#define FDBCLIENT_IDEMPOTENCYID_H
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_IDEMPOTENCY_ID_ACTOR_G_H)
#define FDBCLIENT_IDEMPOTENCY_ID_ACTOR_G_H
#include "fdbclient/IdempotencyId.actor.g.h"
#elif !defined(FDBCLIENT_IDEMPOTENCY_ID_ACTOR_H)
#define FDBCLIENT_IDEMPOTENCY_ID_ACTOR_H
#pragma once
@ -28,12 +33,24 @@
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/serialize.h"
#include "flow/actorcompiler.h" // this has to be the last include
// Location of a committed transaction: the commit version and the index of its
// batch within that version. Returned when an idempotency id lookup succeeds.
struct CommitResult {
Version commitVersion;
uint16_t batchIndex;
};
// The type of the value stored at the key |idempotencyIdsExpiredVersion|:
// the highest commit version whose idempotency ids have been expired (cleaned up).
struct IdempotencyIdsExpiredVersion {
static constexpr auto file_identifier = 3746945;
// Versions <= this have had their idempotency id entries removed. Defaults to 0 (nothing expired).
Version expired = 0;
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, expired);
}
};
// See design/idempotency_ids.md for more information. Designed so that the common case of a random 16 byte id does not
// usually require indirection. Either invalid or an id with length >= 16 and < 256.
struct IdempotencyIdRef {
@ -163,4 +180,10 @@ private:
// Check if id is present in kv, and if so return the commit version and batchIndex
Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const IdempotencyIdRef& id);
#endif
// Make a range containing only the idempotency key associated with version and highOrderBatchIndex
KeyRangeRef makeIdempotencySingleKeyRange(Arena& arena, Version version, uint8_t highOrderBatchIndex);
void decodeIdempotencyKey(KeyRef key, Version& commitVersion, uint8_t& highOrderBatchIndex);
#include "flow/unactorcompiler.h"
#endif

View File

@ -0,0 +1,48 @@
/*
* KeyLocationService.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOUNDATIONDB_KEYLOCATIONSERVICE_H
#define FOUNDATIONDB_KEYLOCATIONSERVICE_H
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/DatabaseContext.h"
// Abstract interface for resolving which storage shard(s) serve a key or key range.
// NOTE(review): both members are private by default (no access specifier before the
// pure-virtual declarations) — confirm whether `public:` was intended for implementers/callers.
class IKeyLocationService {
// If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key).
// Otherwise returns the shard containing key. It's possible the returned location is a failed interface.
virtual Future<KeyRangeLocationInfo> getKeyLocation(TenantInfo tenant,
Key key,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Reverse isBackward,
Version version) = 0;
// Returns locations for up to `limit` shards overlapping `keys`, walking the range in
// the direction given by `reverse`. Locations may refer to failed interfaces.
virtual Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations(TenantInfo tenant,
KeyRange keys,
int limit,
Reverse reverse,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Version version) = 0;
};
#endif // FOUNDATIONDB_KEYLOCATIONSERVICE_H

View File

@ -1027,6 +1027,7 @@ public:
ThreadFuture<Void> protocolVersionMonitor;
Future<Void> sharedStateUpdater;
bool isConfigDB;
// Versions older than 6.1 do not benefit from having their database connections closed. Additionally,
// there are various issues that result in negative behavior in some cases if the connections are closed.

View File

@ -271,6 +271,8 @@ struct TransactionState : ReferenceCounted<TransactionState> {
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;
bool automaticIdempotency = false;
// Only available so that Transaction can have a default constructor, for use in state variables
TransactionState(TaskPriority taskID, SpanContext spanContext)
: taskID(taskID), spanContext(spanContext), tenantSet(false) {}
@ -487,6 +489,7 @@ public:
Database getDatabase() const { return trState->cx; }
static Reference<TransactionLogInfo> createTrLogInfoProbabilistically(const Database& cx);
Transaction& getTransaction() { return *this; }
void setTransactionID(UID id);
void setToken(uint64_t token);
@ -603,6 +606,26 @@ int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess);
// Returns the maximum legal size of a key that can be cleared. Keys larger than this will be assumed not to exist.
int64_t getMaxClearKeySize(KeyRef const& key);
struct KeyRangeLocationInfo;
// Return the aggregated StorageMetrics of range keys to the caller. The locations tell which interface should
// serve the request. The final result is within (min-permittedError/2, max + permittedError/2) if valid.
ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
KeyRange keys,
std::vector<KeyRangeLocationInfo> locations,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError);
// Return the suggested split points from storage server.The locations tell which interface should
// serve the request. `limit` is the current estimated storage metrics of `keys`.The returned points, if present,
// guarantee the metrics of split result is within limit.
ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
std::vector<KeyRangeLocationInfo> locations,
KeyRange keys,
StorageMetrics limit,
StorageMetrics estimated,
Optional<int> minSplitBytes);
namespace NativeAPI {
ACTOR Future<std::vector<std::pair<StorageServerInterface, ProcessClass>>> getServerListAndProcessClasses(
Transaction* tr);

View File

@ -349,6 +349,7 @@ public:
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
bool ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;
@ -715,7 +716,6 @@ public:
int FETCH_KEYS_LOWER_PRIORITY;
int SERVE_FETCH_CHECKPOINT_PARALLELISM;
int SERVE_AUDIT_STORAGE_PARALLELISM;
int CHANGE_FEED_DISK_READS_PARALLELISM;
int BUGGIFY_BLOCK_BYTES;
int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT;
double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD;
@ -754,6 +754,9 @@ public:
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
int STORAGE_FEED_QUERY_HARD_LIMIT;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READ_RANKS;
std::string STORAGESERVER_READ_PRIORITIES;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -883,7 +886,6 @@ public:
int REDWOOD_DEFAULT_EXTENT_SIZE; // Extent size for new Redwood files
int REDWOOD_DEFAULT_EXTENT_READ_SIZE; // Extent read size for Redwood files
int REDWOOD_EXTENT_CONCURRENT_READS; // Max number of simultaneous extent disk reads in progress.
int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.
bool REDWOOD_KVSTORE_RANGE_PREFETCH; // Whether to use range read prefetching
double REDWOOD_PAGE_REBUILD_MAX_SLACK; // When rebuilding pages, max slack to allow in page
int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at
@ -903,6 +905,8 @@ public:
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_PRIORITY_LAUNCHS;
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;
double LATENCY_METRICS_LOGGING_INTERVAL;
@ -947,10 +951,14 @@ public:
int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS;
int BG_MERGE_CANDIDATE_DELAY_SECONDS;
int BG_KEY_TUPLE_TRUNCATE_OFFSET;
bool BG_ENABLE_READ_DRIVEN_COMPACTION;
int BG_RDC_BYTES_FACTOR;
int BG_RDC_READ_FACTOR;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
int BLOB_WORKER_RESNAPSHOT_PARALLELISM;
int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM;
int BLOB_WORKER_RDC_PARALLELISM;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout
@ -972,7 +980,6 @@ public:
// Blob metadata
int64_t BLOB_METADATA_CACHE_TTL;
int64_t BLOB_METADATA_REFRESH_INTERVAL;
// HTTP KMS Connector
std::string REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE;
@ -986,6 +993,9 @@ public:
std::string REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT;
std::string REST_KMS_CONNECTOR_GET_BLOB_METADATA_ENDPOINT;
// Idempotency ids
double IDEMPOTENCY_ID_IN_MEMORY_LIFETIME;
ServerKnobs(Randomize, ClientKnobs*, IsSimulated);
void initialize(Randomize, ClientKnobs*, IsSimulated);
};

View File

@ -45,7 +45,7 @@ struct CheckpointMetaData {
constexpr static FileIdentifier file_identifier = 13804342;
Version version;
KeyRange range;
std::vector<KeyRange> ranges;
int16_t format; // CheckpointFormat.
UID ssID; // Storage server ID on which this checkpoint is created.
UID checkpointID; // A unique id for this checkpoint.
@ -58,11 +58,15 @@ struct CheckpointMetaData {
CheckpointMetaData() = default;
CheckpointMetaData(KeyRange const& range, CheckpointFormat format, UID const& ssID, UID const& checkpointID)
: version(invalidVersion), range(range), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
: version(invalidVersion), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {
this->ranges.push_back(range);
}
CheckpointMetaData(Version version, KeyRange const& range, CheckpointFormat format, UID checkpointID)
: version(version), range(range), format(format), ssID(UID()), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
: version(version), format(format), ssID(UID()), checkpointID(checkpointID), state(Pending), referenceCount(0),
gcTime(0) {
this->ranges.push_back(range);
}
CheckpointState getState() const { return static_cast<CheckpointState>(state); }
@ -73,7 +77,7 @@ struct CheckpointMetaData {
void setFormat(CheckpointFormat format) { this->format = static_cast<int16_t>(format); }
std::string toString() const {
std::string res = "Checkpoint MetaData:\nRange: " + range.toString() + "\nVersion: " + std::to_string(version) +
std::string res = "Checkpoint MetaData:\nRange: " + describe(ranges) + "\nVersion: " + std::to_string(version) +
"\nFormat: " + std::to_string(format) + "\nServer: " + ssID.toString() +
"\nID: " + checkpointID.toString() + "\nState: " + std::to_string(static_cast<int>(state)) +
"\n";
@ -82,7 +86,7 @@ struct CheckpointMetaData {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, range, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
serializer(ar, version, ranges, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
}
};
@ -99,23 +103,28 @@ struct DataMoveMetaData {
constexpr static FileIdentifier file_identifier = 13804362;
UID id; // A unique id for this data move.
Version version;
KeyRange range;
std::vector<KeyRange> ranges;
int priority;
std::set<UID> src;
std::set<UID> dest;
std::set<UID> checkpoints;
int16_t phase; // DataMoveMetaData::Phase.
int8_t mode;
DataMoveMetaData() = default;
DataMoveMetaData(UID id, Version version, KeyRange range)
: id(id), version(version), range(std::move(range)), priority(0) {}
DataMoveMetaData(UID id, KeyRange range) : id(id), version(invalidVersion), range(std::move(range)), priority(0) {}
DataMoveMetaData(UID id, Version version, KeyRange range) : id(id), version(version), priority(0), mode(0) {
this->ranges.push_back(range);
}
DataMoveMetaData(UID id, KeyRange range) : id(id), version(invalidVersion), priority(0), mode(0) {
this->ranges.push_back(range);
}
Phase getPhase() const { return static_cast<Phase>(phase); }
void setPhase(Phase phase) { this->phase = static_cast<int16_t>(phase); }
std::string toString() const {
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + range.toString() +
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + describe(ranges) +
" [Phase]: " + std::to_string(static_cast<int>(phase)) +
" [Source Servers]: " + describe(src) + " [Destination Servers]: " + describe(dest);
return res;
@ -123,7 +132,7 @@ struct DataMoveMetaData {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, id, version, range, phase, src, dest);
serializer(ar, id, version, ranges, priority, src, dest, checkpoints, phase, mode);
}
};

View File

@ -890,16 +890,16 @@ struct ChangeFeedStreamRequest {
KeyRange range;
int replyBufferSize = -1;
bool canReadPopped = true;
UID debugUID; // This is only used for debugging and tracing, but being able to link a client + server side stream
// is so useful for testing, and this is such small overhead compared to streaming large amounts of
// change feed data, it is left in the interface
UID id; // This must be globally unique among ChangeFeedStreamRequest instances
Optional<ReadOptions> options;
ReplyPromiseStream<ChangeFeedStreamReply> reply;
ChangeFeedStreamRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, debugUID, arena);
serializer(
ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, id, options, arena);
}
};

View File

@ -92,8 +92,6 @@ void decodeKeyServersValue(RangeResult result,
UID& destID,
bool missingIsError = true);
extern const KeyRef clusterIdKey;
extern const KeyRangeRef auditRange;
extern const KeyRef auditPrefix;
const Key auditRangeKey(const AuditType type, const UID& auditId, const KeyRef& key);
@ -505,6 +503,9 @@ extern const KeyRangeRef timeKeeperPrefixRange;
extern const KeyRef timeKeeperVersionKey;
extern const KeyRef timeKeeperDisableKey;
// Durable cluster ID key
extern const KeyRef clusterIdKey;
// Layer status metadata prefix
extern const KeyRangeRef layerStatusMetaPrefixRange;

View File

@ -68,6 +68,10 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con
// 1. Lookup by 'TenantId'
// 2. Lookup by 'TenantPrefix'
// 3. Lookup by 'TenantName'
// TODO: Currently this cache performs poorly if there are tenant access happening to unknown tenants which happens most
// frequently in optional tenant mode but can also happen in required mode if there are alot of tenants created. Further
// as a consequence of the design we cannot be sure that the state of a given tenant is accurate even if its present in
// the cache.
template <class T>
class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable {

View File

@ -273,17 +273,4 @@ struct ITracer {
virtual void trace(Span const& span) = 0;
};
void openTracer(TracerType type);
// A Deque that carries a tracing Span alongside its elements, so queued work
// stays associated with the trace context that produced it. Move-only: copy
// construction/assignment are deleted to keep span ownership unique.
template <class T>
struct SpannedDeque : Deque<T> {
	Span span;
	explicit SpannedDeque(Location loc) : span(loc) {}
	SpannedDeque(SpannedDeque&& other) : Deque<T>(std::move(other)), span(std::move(other.span)) {}
	SpannedDeque(SpannedDeque const&) = delete;
	SpannedDeque& operator=(SpannedDeque const&) = delete;
	SpannedDeque& operator=(SpannedDeque&& other) {
		*static_cast<Deque<T>*>(this) = std::move(other);
		span = std::move(other.span);
		// BUG FIX: `return *this;` was missing — flowing off the end of a
		// value-returning function is undefined behavior in C++.
		return *this;
	}
};
void openTracer(TracerType type);

View File

@ -279,7 +279,7 @@ description is not currently required but encouraged.
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use."
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use. Unless the automatic_idempotency option is set after this option, the client will not automatically attempt to remove this id from the cluster after a successful commit."
hidden="true" />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use."

View File

@ -5,9 +5,8 @@ get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
strip_debug_symbols(fdbmonitor)
assert_no_version_h(fdbmonitor)
if(UNIX AND NOT APPLE)
target_link_libraries(fdbmonitor PRIVATE rt)
target_link_libraries(fdbmonitor PRIVATE rt)
endif()
# FIXME: This include directory is an ugly hack. We probably want to fix this.
# as soon as we get rid of the old build system
@ -17,17 +16,17 @@ target_link_libraries(fdbmonitor PUBLIC Threads::Threads)
# appears to change its behavior (it no longer seems to restart killed
# processes). fdbmonitor is single-threaded anyway.
get_target_property(fdbmonitor_options fdbmonitor COMPILE_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY COMPILE_OPTIONS ${fdbmonitor_options})
endif ()
endif()
get_target_property(fdbmonitor_options fdbmonitor LINK_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY LINK_OPTIONS ${fdbmonitor_options})
endif ()
endif()
if(GENERATE_DEBUG_PACKAGES)
fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server)
@ -51,7 +50,7 @@ add_custom_target(clean_sandbox
add_custom_target(start_sandbox
COMMAND ${CMAKE_BINARY_DIR}/bin/fdbmonitor --conffile ${CMAKE_BINARY_DIR}/sandbox/foundationdb.conf
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
add_dependencies(start_sandbox fdbmonitor fdbserver)
@ -61,6 +60,6 @@ if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
endif()
add_custom_target(generate_profile
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)

View File

@ -24,8 +24,8 @@
Counter::Counter(std::string const& name, CounterCollection& collection)
: name(name), interval_start(0), last_event(0), interval_sq_time(0), roughness_interval_start(0), interval_delta(0),
interval_start_value(0) {
metric.init(collection.name + "." + (char)toupper(name.at(0)) + name.substr(1), collection.id);
collection.counters.push_back(this);
metric.init(collection.getName() + "." + (char)toupper(name.at(0)) + name.substr(1), collection.getId());
collection.addCounter(this);
}
void Counter::operator+=(Value delta) {
@ -88,36 +88,48 @@ void CounterCollection::logToTraceEvent(TraceEvent& te) const {
}
}
ACTOR Future<Void> traceCounters(std::string traceEventName,
UID traceEventID,
double interval,
CounterCollection* counters,
std::string trackLatestName,
std::function<void(TraceEvent&)> decorator) {
wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized
class CounterCollectionImpl {
public:
ACTOR static Future<Void> traceCounters(CounterCollection* counters,
std::string traceEventName,
UID traceEventID,
double interval,
std::string trackLatestName,
std::function<void(TraceEvent&)> decorator) {
wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized
for (ICounter* c : counters->counters)
c->resetInterval();
state Reference<EventCacheHolder> traceEventHolder;
if (!trackLatestName.empty()) {
traceEventHolder = makeReference<EventCacheHolder>(trackLatestName);
}
state double last_interval = now();
loop {
TraceEvent te(traceEventName.c_str(), traceEventID);
te.detail("Elapsed", now() - last_interval);
counters->logToTraceEvent(te);
decorator(te);
for (ICounter* c : counters->counters)
c->resetInterval();
state Reference<EventCacheHolder> traceEventHolder;
if (!trackLatestName.empty()) {
te.trackLatest(traceEventHolder->trackingKey);
traceEventHolder = makeReference<EventCacheHolder>(trackLatestName);
}
last_interval = now();
wait(delay(interval, TaskPriority::FlushTrace));
state double last_interval = now();
loop {
TraceEvent te(traceEventName.c_str(), traceEventID);
te.detail("Elapsed", now() - last_interval);
counters->logToTraceEvent(te);
decorator(te);
if (!trackLatestName.empty()) {
te.trackLatest(traceEventHolder->trackingKey);
}
last_interval = now();
wait(delay(interval, TaskPriority::FlushTrace));
}
}
};
// Public entry point for periodic counter tracing: forwards to the ACTOR
// implementation in CounterCollectionImpl, which emits a TraceEvent named
// `traceEventName` every `interval` seconds and resets each counter's interval
// state. `decorator` can add extra details to each event; `trackLatestName`,
// if non-empty, also publishes the event via trackLatest.
Future<Void> CounterCollection::traceCounters(std::string const& traceEventName,
UID traceEventID,
double interval,
std::string const& trackLatestName,
std::function<void(TraceEvent&)> const& decorator) {
return CounterCollectionImpl::traceCounters(
this, traceEventName, traceEventID, interval, trackLatestName, decorator);
}

View File

@ -757,12 +757,18 @@ Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const BasicLoadBalanc
Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const void*);
// A simpler version of LoadBalance that does not send second requests where the list of servers are always fresh
//
// If |alternativeChosen| is not null, then atMostOnce must be True, and if the returned future completes successfully
// then *alternativeChosen will be the alternative to which the message was sent. *alternativeChosen must outlive the
// returned future.
ACTOR template <class Interface, class Request, class Multi, bool P>
Future<REPLY_TYPE(Request)> basicLoadBalance(Reference<ModelInterface<Multi>> alternatives,
RequestStream<Request, P> Interface::*channel,
Request request = Request(),
TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
AtMostOnce atMostOnce = AtMostOnce::False) {
AtMostOnce atMostOnce = AtMostOnce::False,
int* alternativeChosen = nullptr) {
ASSERT(alternativeChosen == nullptr || atMostOnce == AtMostOnce::True);
setReplyPriority(request, taskID);
if (!alternatives)
return Never();
@ -791,6 +797,9 @@ Future<REPLY_TYPE(Request)> basicLoadBalance(Reference<ModelInterface<Multi>> al
useAlt = (nextAlt + alternatives->size() - 1) % alternatives->size();
stream = &alternatives->get(useAlt, channel);
if (alternativeChosen != nullptr) {
*alternativeChosen = useAlt;
}
if (!IFailureMonitor::failureMonitor().getState(stream->getEndpoint()).failed)
break;
nextAlt = (nextAlt + 1) % alternatives->size();

View File

@ -67,17 +67,37 @@ struct Traceable<ICounter*> : std::true_type {
}
};
struct CounterCollection {
CounterCollection(std::string name, std::string id = std::string()) : name(name), id(id) {}
std::vector<struct ICounter*> counters, counters_to_remove;
~CounterCollection() {
for (auto c : counters_to_remove)
c->remove();
}
class CounterCollection {
friend class CounterCollectionImpl;
std::string name;
std::string id;
std::vector<struct ICounter*> counters, countersToRemove;
public:
CounterCollection(std::string const& name, std::string const& id = std::string()) : name(name), id(id) {}
~CounterCollection() {
for (auto c : countersToRemove)
c->remove();
}
void addCounter(ICounter* counter) { counters.push_back(counter); }
// Call remove method on this counter in ~CounterCollection
void markForRemoval(ICounter* counter) { countersToRemove.push_back(counter); }
std::string const& getName() const { return name; }
std::string const& getId() const { return id; }
void logToTraceEvent(TraceEvent& te) const;
Future<Void> traceCounters(
std::string const& traceEventName,
UID traceEventID,
double interval,
std::string const& trackLatestName = std::string(),
std::function<void(TraceEvent&)> const& decorator = [](auto& te) {});
};
struct Counter final : ICounter, NonCopyable {
@ -131,8 +151,8 @@ struct Traceable<Counter> : std::true_type {
template <class F>
struct SpecialCounter final : ICounter, FastAllocated<SpecialCounter<F>>, NonCopyable {
SpecialCounter(CounterCollection& collection, std::string const& name, F&& f) : name(name), f(f) {
collection.counters.push_back(this);
collection.counters_to_remove.push_back(this);
collection.addCounter(this);
collection.markForRemoval(this);
}
void remove() override { delete this; }
@ -162,14 +182,6 @@ static void specialCounter(CounterCollection& collection, std::string const& nam
new SpecialCounter<F>(collection, name, std::move(f));
}
Future<Void> traceCounters(
std::string const& traceEventName,
UID const& traceEventID,
double const& interval,
CounterCollection* const& counters,
std::string const& trackLatestName = std::string(),
std::function<void(TraceEvent&)> const& decorator = [](TraceEvent& te) {});
class LatencyBands {
public:
LatencyBands(std::string name, UID id, double loggingInterval)
@ -180,7 +192,7 @@ public:
if (bands.size() == 0) {
ASSERT(!cc && !filteredCount);
cc = std::make_unique<CounterCollection>(name, id.toString());
logger = traceCounters(name, id, loggingInterval, cc.get(), id.toString() + "/" + name);
logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name);
filteredCount = std::make_unique<Counter>("Filtered", *cc);
insertBand(std::numeric_limits<double>::infinity());
}

View File

@ -54,6 +54,7 @@ public:
FailDisk,
RebootAndDelete,
RebootProcessAndDelete,
RebootProcessAndSwitch,
Reboot,
RebootProcess,
None
@ -104,6 +105,7 @@ public:
bool excluded;
bool cleared;
bool rebooting;
bool drProcess;
std::vector<flowGlobalType> globals;
INetworkConnections* network;
@ -128,8 +130,8 @@ public:
const char* coordinationFolder)
: name(name), coordinationFolder(coordinationFolder), dataFolder(dataFolder), machine(nullptr),
addresses(addresses), address(addresses.address), locality(locality), startingClass(startingClass),
failed(false), excluded(false), cleared(false), rebooting(false), network(net), fault_injection_r(0),
fault_injection_p1(0), fault_injection_p2(0), failedDisk(false) {
failed(false), excluded(false), cleared(false), rebooting(false), drProcess(false), network(net),
fault_injection_r(0), fault_injection_p1(0), fault_injection_p2(0), failedDisk(false) {
uid = deterministicRandom()->randomUniqueID();
}
@ -283,7 +285,8 @@ public:
ProcessClass startingClass,
const char* dataFolder,
const char* coordinationFolder,
ProtocolVersion protocol) = 0;
ProtocolVersion protocol,
bool drProcess) = 0;
virtual void killProcess(ProcessInfo* machine, KillType) = 0;
virtual void rebootProcess(Optional<Standalone<StringRef>> zoneId, bool allProcesses) = 0;
virtual void rebootProcess(ProcessInfo* process, KillType kt) = 0;
@ -304,6 +307,7 @@ public:
KillType kt,
bool forceKill = false,
KillType* ktFinal = nullptr) = 0;
virtual bool killAll(KillType kt, bool forceKill = false, KillType* ktFinal = nullptr) = 0;
// virtual KillType getMachineKillState( UID zoneID ) = 0;
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
std::vector<ProcessInfo*> const& deadProcesses,
@ -390,6 +394,13 @@ public:
return clearedAddresses.find(address) != clearedAddresses.end();
}
void switchCluster(NetworkAddress const& address) { switchedCluster[address] = !switchedCluster[address]; }
bool hasSwitchedCluster(NetworkAddress const& address) const {
return switchedCluster.find(address) != switchedCluster.end() ? switchedCluster.at(address) : false;
}
void toggleGlobalSwitchCluster() { globalSwitchedCluster = !globalSwitchedCluster; }
bool globalHasSwitchedCluster() const { return globalSwitchedCluster; }
void excludeAddress(NetworkAddress const& address) {
excludedAddresses[address]++;
TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]);
@ -540,6 +551,8 @@ private:
std::set<Optional<Standalone<StringRef>>> swapsDisabled;
std::map<NetworkAddress, int> excludedAddresses;
std::map<NetworkAddress, int> clearedAddresses;
std::map<NetworkAddress, bool> switchedCluster;
bool globalSwitchedCluster = false;
std::map<NetworkAddress, std::map<std::string, int>> roleAddresses;
std::map<std::string, double> disabledMap;
bool allSwapsDisabled;

View File

@ -1261,7 +1261,8 @@ public:
ProcessClass startingClass,
const char* dataFolder,
const char* coordinationFolder,
ProtocolVersion protocol) override {
ProtocolVersion protocol,
bool drProcess) override {
ASSERT(locality.machineId().present());
MachineInfo& machine = machines[locality.machineId().get()];
if (!machine.machineId.present())
@ -1311,6 +1312,7 @@ public:
m->excluded = g_simulator->isExcluded(NetworkAddress(ip, port, true, false));
m->cleared = g_simulator->isCleared(addresses.address);
m->protocolVersion = protocol;
m->drProcess = drProcess;
m->setGlobal(enTDMetrics, (flowGlobalType)&m->tdmetrics);
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) {
@ -1324,7 +1326,8 @@ public:
.detail("Address", m->address)
.detail("MachineId", m->locality.machineId())
.detail("Excluded", m->excluded)
.detail("Cleared", m->cleared);
.detail("Cleared", m->cleared)
.detail("DrProcess", m->drProcess);
if (std::string(name) == "remote flow process") {
protectedAddresses.insert(m->address);
@ -1794,6 +1797,15 @@ public:
}
return result;
}
bool killAll(KillType kt, bool forceKill, KillType* ktFinal) override {
bool result = false;
for (auto& machine : machines) {
if (killMachine(machine.second.machineId, kt, forceKill, ktFinal)) {
result = true;
}
}
return result;
}
bool killMachine(Optional<Standalone<StringRef>> machineId,
KillType kt,
bool forceKill,
@ -1816,6 +1828,7 @@ public:
}
int processesOnMachine = 0;
bool isMainCluster = true; // false for machines running DR processes
KillType originalKt = kt;
// Reboot if any of the processes are protected and count the number of processes not rebooting
@ -1824,6 +1837,9 @@ public:
kt = Reboot;
if (!process->rebooting)
processesOnMachine++;
if (process->drProcess) {
isMainCluster = false;
}
}
// Do nothing, if no processes to kill
@ -1950,8 +1966,13 @@ public:
probe::context::sim2,
probe::assert::simOnly);
// Check if any processes on machine are rebooting
if (processesOnMachine != processesPerMachine && kt >= RebootAndDelete) {
if (isMainCluster && originalKt == RebootProcessAndSwitch) {
// When killing processes with the RebootProcessAndSwitch kill
// type, processes in the original cluster should be rebooted in
// order to kill any zombie processes.
kt = KillType::Reboot;
} else if (processesOnMachine != processesPerMachine && kt != RebootProcessAndSwitch) {
// Check if any processes on machine are rebooting
CODE_PROBE(true,
"Attempted reboot, but the target did not have all of its processes running",
probe::context::sim2,
@ -1968,24 +1989,6 @@ public:
return false;
}
// Check if any processes on machine are rebooting
if (processesOnMachine != processesPerMachine) {
CODE_PROBE(true,
"Attempted reboot and kill, but the target did not have all of its processes running",
probe::context::sim2,
probe::assert::simOnly);
TraceEvent(SevWarn, "AbortedKill")
.detail("KillType", kt)
.detail("MachineId", machineId)
.detail("Reason", "Machine processes does not match number of processes per machine")
.detail("Processes", processesOnMachine)
.detail("ProcessesPerMachine", processesPerMachine)
.backtrace();
if (ktFinal)
*ktFinal = None;
return false;
}
TraceEvent("KillMachine")
.detail("MachineId", machineId)
.detail("Kt", kt)
@ -2008,7 +2011,7 @@ public:
if (process->startingClass != ProcessClass::TesterClass)
killProcess_internal(process, kt);
}
} else if (kt == Reboot || kt == RebootAndDelete) {
} else if (kt == Reboot || kt == RebootAndDelete || kt == RebootProcessAndSwitch) {
for (auto& process : machines[machineId].processes) {
TraceEvent("KillMachineProcess")
.detail("KillType", kt)
@ -2564,7 +2567,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
try {
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
kt == ISimulator::RebootProcessAndDelete);
kt == ISimulator::RebootProcessAndDelete || kt == ISimulator::RebootProcessAndSwitch);
CODE_PROBE(kt == ISimulator::RebootProcess,
"Simulated process rebooted",
@ -2580,6 +2583,10 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
"Simulated process rebooted with data and coordination state deletion",
probe::assert::simOnly,
probe::context::sim2);
CODE_PROBE(kt == ISimulator::RebootProcessAndSwitch,
"Simulated process rebooted with different cluster file",
probe::assert::simOnly,
probe::context::sim2);
if (p->rebooting || !p->isReliable()) {
TraceEvent(SevDebug, "DoRebootFailed")
@ -2608,6 +2615,8 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) {
p->cleared = true;
g_simulator->clearAddress(p->address);
} else if (kt == ISimulator::RebootProcessAndSwitch) {
g_simulator->switchCluster(p->address);
}
p->shutdownSignal.send(kt);
} catch (Error& e) {

View File

@ -613,7 +613,7 @@ private:
m.param1.startsWith(applyMutationsAddPrefixRange.begin) ||
m.param1.startsWith(applyMutationsRemovePrefixRange.begin) || m.param1.startsWith(tagLocalityListPrefix) ||
m.param1.startsWith(serverTagHistoryPrefix) ||
m.param1.startsWith(testOnlyTxnStateStorePrefixRange.begin) || m.param1 == clusterIdKey) {
m.param1.startsWith(testOnlyTxnStateStorePrefixRange.begin)) {
txnStateStore->set(KeyValueRef(m.param1, m.param2));
}

View File

@ -290,8 +290,8 @@ struct BackupData {
specialCounter(cc, "MsgQ", [this]() { return this->messages.size(); });
specialCounter(cc, "BufferedBytes", [this]() { return this->lock->activePermits(); });
specialCounter(cc, "AvailableBytes", [this]() { return this->lock->available(); });
logger = traceCounters(
"BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "BackupWorkerMetrics");
logger =
cc.traceCounters("BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "BackupWorkerMetrics");
}
bool pullFinished() const { return endVersion.present() && pulledVersion.get() > endVersion.get(); }

View File

@ -0,0 +1,202 @@
/*
* BlobConnectionProviderTest.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BlobConnectionProvider.h"
#include "flow/UnitTest.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // has to be last include
void forceLinkBlobConnectionProviderTests() {}
// Randomized parameters for the blob connection provider unit test below.
// All randomness comes from deterministicRandom() so runs are reproducible.
struct ConnectionProviderTestSettings {
	uint32_t numProviders;
	uint32_t filesPerProvider;
	uint32_t maxFileMemory; // total in-memory budget across all providers' files
	uint32_t maxFileSize;
	uint32_t threads; // number of concurrent worker actors
	bool uniformProviderChoice; // uniform vs skewed provider selection in workers
	double readWriteSplit; // probability a worker iteration writes (vs reads)
	double runtime; // workload phase duration in seconds

	// totals accumulated by the worker actors
	int writeOps;
	int readOps;

	ConnectionProviderTestSettings() {
		numProviders = deterministicRandom()->randomSkewedUInt32(1, 1000);
		// cap per-provider file count so numProviders * filesPerProvider stays bounded
		filesPerProvider =
		    1 + std::min((uint32_t)100, deterministicRandom()->randomSkewedUInt32(10, 10000) / numProviders);

		maxFileMemory = 1024 * 1024 * 1024;
		// size files so the full data set fits in the memory budget, capped at 16MB
		maxFileSize = maxFileMemory / (numProviders * filesPerProvider);
		maxFileSize = deterministicRandom()->randomSkewedUInt32(8, std::min((uint32_t)(16 * 1024 * 1024), maxFileSize));

		threads = deterministicRandom()->randomInt(16, 128);

		uniformProviderChoice = deterministicRandom()->coinflip();
		readWriteSplit = deterministicRandom()->randomInt(1, 10) / 10.0;

		runtime = 60.0;

		writeOps = 0;
		readOps = 0;
	}
};
// Per-provider test state: the provider under test, the (full path, contents)
// pairs written so far, and the object names already claimed (collision guard).
struct ProviderTestData {
	Reference<BlobConnectionProvider> provider;
	// objects successfully written and available for verification reads
	std::vector<std::pair<std::string, Value>> data;
	std::unordered_set<std::string> usedNames;

	ProviderTestData() {}
	explicit ProviderTestData(Reference<BlobConnectionProvider> provider) : provider(provider) {}
};
// Writes one randomly-sized, randomly-named object through `provider` and, on
// success, appends it to provider->data so later reads can verify it.
ACTOR Future<Void> createObject(ConnectionProviderTestSettings* settings, ProviderTestData* provider) {
	// pick object name before wait so no collisions between concurrent writes
	std::string objName;
	loop {
		objName = deterministicRandom()->randomAlphaNumeric(12);
		if (provider->usedNames.insert(objName).second) {
			break;
		}
	}

	int randomDataSize = deterministicRandom()->randomInt(1, settings->maxFileSize);
	state Value data = makeString(randomDataSize);
	deterministicRandom()->randomBytes(mutateString(data), randomDataSize);

	state Reference<BackupContainerFileSystem> bstore;
	state std::string fullPath;
	std::tie(bstore, fullPath) = provider->provider->createForWrite(objName);

	state Reference<IBackupFile> file = wait(bstore->writeFile(fullPath));
	wait(file->append(data.begin(), data.size()));
	wait(file->finish());

	// after write, put in the readable list
	provider->data.push_back({ fullPath, data });

	return Void();
}
// Reads the object at `objFullPath` in full and asserts its size and contents
// exactly match `expectedData`.
ACTOR Future<Void> readAndVerifyObject(ProviderTestData* provider, std::string objFullPath, Value expectedData) {
	Reference<BackupContainerFileSystem> bstore = provider->provider->getForRead(objFullPath);
	state Reference<IAsyncFile> reader = wait(bstore->readFile(objFullPath));

	state Value actualData = makeString(expectedData.size());
	// single read from offset 0 for the whole expected length
	int readSize = wait(reader->read(mutateString(actualData), expectedData.size(), 0));
	ASSERT_EQ(expectedData.size(), readSize);
	ASSERT(expectedData == actualData);

	return Void();
}
// Removes a previously written object from its backing container.
Future<Void> deleteObject(ProviderTestData* provider, std::string objFullPath) {
	auto container = provider->provider->getForRead(objFullPath);
	return container->deleteFile(objFullPath);
}
// One simulated worker: until `runtime` elapses, repeatedly pick a provider
// and either write a new object or read back (and verify) an existing one.
// Any error is printed and rethrown to fail the test.
ACTOR Future<Void> workerThread(ConnectionProviderTestSettings* settings, std::vector<ProviderTestData>* providers) {
	state double endTime = now() + settings->runtime;
	try {
		while (now() < endTime) {
			// randomly pick provider
			int providerIdx;
			if (settings->uniformProviderChoice) {
				providerIdx = deterministicRandom()->randomInt(0, providers->size());
			} else {
				providerIdx = deterministicRandom()->randomSkewedUInt32(0, providers->size());
			}
			ProviderTestData* provider = &(*providers)[providerIdx];

			// randomly pick create or read
			bool doWrite = deterministicRandom()->random01() < settings->readWriteSplit;
			if (provider->usedNames.size() < settings->filesPerProvider && (provider->data.empty() || doWrite)) {
				// create an object
				wait(createObject(settings, provider));
				settings->writeOps++;
			} else if (!provider->data.empty()) {
				// read a random object
				auto& readInfo = provider->data[deterministicRandom()->randomInt(0, provider->data.size())];
				wait(readAndVerifyObject(provider, readInfo.first, readInfo.second));
				settings->readOps++;
			} else {
				// other threads are creating files up to filesPerProvider limit, but none finished yet. Just wait
				wait(delay(0.1));
			}
		}
		return Void();
	} catch (Error& e) {
		// surface the failure loudly before propagating it to the test harness
		fmt::print("WorkerThread Unexpected Error {0}\n", e.name());
		throw e;
	}
}
// Final pass over one provider: verify every object written during the
// workload phase, then delete it from the store.
ACTOR Future<Void> checkAndCleanUp(ProviderTestData* provider) {
	state int i;
	// every claimed name should have produced exactly one readable object
	ASSERT(provider->usedNames.size() == provider->data.size());

	for (i = 0; i < provider->data.size(); i++) {
		auto& readInfo = provider->data[i];
		wait(readAndVerifyObject(provider, readInfo.first, readInfo.second));
		wait(deleteObject(provider, provider->data[i].first));
	}

	return Void();
}
// maybe this should be a workload instead?
// Unit test: builds many randomized BlobConnectionProviders, hammers them with
// concurrent read/write worker actors for a fixed time, then verifies and
// deletes every object written.
TEST_CASE("/fdbserver/blob/connectionprovider") {
	state ConnectionProviderTestSettings settings;

	state std::vector<ProviderTestData> providers;
	providers.reserve(settings.numProviders);
	for (int i = 0; i < settings.numProviders; i++) {
		std::string nameStr = std::to_string(i);
		BlobMetadataDomainName name(nameStr);
		// each provider is backed by its own randomly generated blob metadata
		auto metadata = createRandomTestBlobMetadata(SERVER_KNOBS->BG_URL, i, name);
		providers.emplace_back(BlobConnectionProvider::newBlobConnectionProvider(metadata));
	}
	fmt::print("BlobConnectionProviderTest\n");

	// workload phase: settings.threads concurrent workers for settings.runtime seconds
	state std::vector<Future<Void>> futures;
	futures.reserve(settings.threads);
	for (int i = 0; i < settings.threads; i++) {
		futures.push_back(workerThread(&settings, &providers));
	}

	wait(waitForAll(futures));

	fmt::print("BlobConnectionProviderTest workload phase complete with {0} files and {1} reads\n",
	           settings.writeOps,
	           settings.readOps);

	// verification phase: re-read and then delete every written object
	futures.clear();
	futures.reserve(providers.size());
	for (int i = 0; i < providers.size(); i++) {
		futures.push_back(checkAndCleanUp(&providers[i]));
	}

	wait(waitForAll(futures));

	fmt::print("BlobConnectionProviderTest check and cleanup phase complete\n");
	return Void();
}

View File

@ -296,7 +296,7 @@ struct BlobManagerStats {
specialCounter(cc, "HardBoundaries", [mergeHardBoundaries]() { return mergeHardBoundaries->size(); });
specialCounter(cc, "SoftBoundaries", [mergeBoundaries]() { return mergeBoundaries->size(); });
specialCounter(cc, "BlockedAssignments", [this]() { return this->blockedAssignments; });
logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics");
logger = cc.traceCounters("BlobManagerMetrics", id, interval, "BlobManagerMetrics");
}
};

View File

@ -84,6 +84,15 @@ struct GranuleStartState {
Optional<GranuleHistory> history;
};
// TODO: add more (blob file request cost, in-memory mutations vs blob delta file, etc...)
struct GranuleReadStats {
int64_t deltaBytesRead;
void reset() { deltaBytesRead = 0; }
GranuleReadStats() { reset(); }
};
struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
KeyRange keyRange;
@ -120,11 +129,74 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
AssignBlobRangeRequest originalReq;
GranuleReadStats readStats;
bool rdcCandidate;
Promise<Void> runRDC;
void resume() {
if (resumeSnapshot.canBeSet()) {
resumeSnapshot.send(Void());
}
}
void resetReadStats() {
rdcCandidate = false;
readStats.reset();
runRDC.reset();
}
// determine eligibility (>1) and priority for re-snapshotting this granule
double weightRDC() {
	// ratio of read amp to write amp that would be incurred by re-snapshotting now
	int64_t lastSnapshotSize = (files.snapshotFiles.empty()) ? 0 : files.snapshotFiles.back().length;
	// floor tiny/absent snapshots at half the target size so freshly created
	// granules don't look artificially cheap to re-snapshot
	int64_t minSnapshotSize = SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2;
	lastSnapshotSize = std::max(minSnapshotSize, lastSnapshotSize);

	int64_t writeAmp = lastSnapshotSize + bufferedDeltaBytes + bytesInNewDeltaFiles;
	// read amp is deltaBytesRead. Read amp must be READ_FACTOR times larger than write amp
	return (1.0 * readStats.deltaBytesRead) / (writeAmp * SERVER_KNOBS->BG_RDC_READ_FACTOR);
}
// Cheap pre-filter for read-driven compaction: delta bytes read since the
// last snapshot must exceed READ_FACTOR times the delta bytes written.
bool isEligibleRDC() {
	// granule should be reasonably read-hot to be eligible
	int64_t bytesWritten = bufferedDeltaBytes + bytesInNewDeltaFiles;
	return bytesWritten * SERVER_KNOBS->BG_RDC_READ_FACTOR < readStats.deltaBytesRead;
}
// Accumulates the delta bytes this read had to merge and decides whether the
// granule newly becomes a read-driven compaction (RDC) candidate. Returns
// true at most once per snapshot cycle — when the granule first crosses the
// eligibility and weight thresholds — so the caller can trigger the RDC scan.
bool updateReadStats(Version readVersion, const BlobGranuleChunkRef& chunk) {
	// Only update stats for re-compacting for at-latest reads that have to do snapshot + delta merge
	if (!SERVER_KNOBS->BG_ENABLE_READ_DRIVEN_COMPACTION || !chunk.snapshotFile.present() ||
	    pendingSnapshotVersion != durableSnapshotVersion.get() || readVersion <= pendingSnapshotVersion) {
		return false;
	}
	// reads that merged no deltas contribute nothing
	if (chunk.newDeltas.empty() && chunk.deltaFiles.empty()) {
		return false;
	}

	readStats.deltaBytesRead += chunk.newDeltas.expectedSize();
	for (auto& it : chunk.deltaFiles) {
		readStats.deltaBytesRead += it.length;
	}

	if (rdcCandidate) {
		// already flagged this cycle; don't re-trigger
		return false;
	}

	if (isEligibleRDC() && weightRDC() > 1.0) {
		rdcCandidate = true;
		CODE_PROBE(true, "Granule read triggering read-driven compaction");
		if (BW_DEBUG) {
			fmt::print("Triggering read-driven compaction of [{0} - {1})\n",
			           keyRange.begin.printable(),
			           keyRange.end.printable());
		}
		return true;
	}
	return false;
}
inline bool doReadDrivenCompaction() { return runRDC.isSet(); }
};
struct GranuleRangeMetadata {
@ -200,6 +272,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
NotifiedVersion grvVersion;
Promise<Void> fatalError;
Promise<Void> simInjectFailure;
Promise<Void> doReadDrivenCompaction;
Reference<FlowLock> initialSnapshotLock;
Reference<FlowLock> resnapshotLock;
@ -293,6 +366,13 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
return stats.estimatedMaxResidentMemory >= memoryFullThreshold;
}
void triggerReadDrivenCompaction() {
Promise<Void> doRDC = doReadDrivenCompaction;
if (doRDC.canBeSet()) {
doRDC.send(Void());
}
}
bool maybeInjectTargetedRestart() {
// inject a BW restart at most once per test
if (g_network->isSimulated() && !g_simulator->speedUpSimulation &&
@ -1107,7 +1187,6 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
}
retries++;
CODE_PROBE(true, "Granule initial snapshot failed");
// FIXME: why can't we supress error event?
TraceEvent(retries < 10 ? SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id)
.error(err)
.detail("Granule", metadata->keyRange)
@ -2043,6 +2122,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->pendingDeltaVersion = startVersion;
metadata->bufferedDeltaVersion = startVersion;
metadata->knownCommittedVersion = startVersion;
metadata->resetReadStats();
Reference<ChangeFeedData> cfData = makeReference<ChangeFeedData>(bwData->db.getPtr());
@ -2185,6 +2265,10 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
nextForceFlush = metadata->forceFlushVersion.whenAtLeast(lastForceFlushVersion + 1);
}
when(wait(metadata->runRDC.getFuture())) {
// return control flow back to the triggering actor before continuing
wait(delay(0));
}
}
} catch (Error& e) {
// only error we should expect here is when we finish consuming old change feed
@ -2311,6 +2395,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
startState.granuleID,
inFlightFiles.empty() ? Future<Void>(Void())
: success(inFlightFiles.back().future));
metadata->resetReadStats();
}
// reset force flush state, requests should retry and add it back once feed is ready
forceFlushVersions.clear();
@ -2419,20 +2504,20 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// The force flush contract is a version cannot be put in forceFlushVersion unless the change feed
// is already whenAtLeast that version
bool forceFlush = !forceFlushVersions.empty() && forceFlushVersions.back() > metadata->pendingDeltaVersion;
bool doReadDrivenFlush = !metadata->currentDeltas.empty() && metadata->doReadDrivenCompaction();
CODE_PROBE(forceFlush, "Force flushing granule");
if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush) {
if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush ||
doReadDrivenFlush) {
TraceEvent(SevDebug, "BlobGranuleDeltaFile", bwData->id)
.detail("Granule", metadata->keyRange)
.detail("Version", lastDeltaVersion);
// sanity check for version order
if (forceFlush) {
if (forceFlush || doReadDrivenFlush) {
if (lastDeltaVersion == invalidVersion) {
lastDeltaVersion = metadata->currentDeltas.empty() ? metadata->pendingDeltaVersion
: metadata->currentDeltas.back().version;
lastDeltaVersion = metadata->bufferedDeltaVersion;
}
if (lastDeltaVersion < forceFlushVersions.back()) {
if (!forceFlushVersions.empty() && lastDeltaVersion < forceFlushVersions.back()) {
if (BW_DEBUG) {
fmt::print("Granule [{0} - {1}) force flushing delta version {2} -> {3}\n",
metadata->keyRange.begin.printable(),
@ -2444,13 +2529,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
}
if (!metadata->currentDeltas.empty()) {
if (lastDeltaVersion < metadata->currentDeltas.back().version) {
fmt::print("Granule [{0} - {1}) LDV {2} < DeltaBack {3}\n",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
lastDeltaVersion,
metadata->currentDeltas.back().version);
}
ASSERT(lastDeltaVersion >= metadata->currentDeltas.back().version);
ASSERT(metadata->pendingDeltaVersion < metadata->currentDeltas.front().version);
} else {
@ -2507,6 +2585,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// add new pending delta file
ASSERT(metadata->pendingDeltaVersion < lastDeltaVersion);
metadata->pendingDeltaVersion = lastDeltaVersion;
ASSERT(metadata->bufferedDeltaVersion <= lastDeltaVersion);
metadata->bufferedDeltaVersion = lastDeltaVersion; // In case flush was forced at non-mutation version
metadata->bytesInNewDeltaFiles += metadata->bufferedDeltaBytes;
@ -2528,6 +2607,9 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// Wait on delta file starting here. If we have too many pending delta file writes, we need to not
// continue to consume from the change feed, as that will pile on even more delta files to write
wait(startDeltaFileWrite);
} else if (metadata->doReadDrivenCompaction()) {
ASSERT(metadata->currentDeltas.empty());
snapshotEligible = true;
}
// FIXME: if we're still reading from old change feed, we should probably compact if we're
@ -2535,7 +2617,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// yet
// If we have enough delta files, try to re-snapshot
if (snapshotEligible && metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT) {
if (snapshotEligible && (metadata->doReadDrivenCompaction() ||
metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT)) {
if (BW_DEBUG && !inFlightFiles.empty()) {
fmt::print("Granule [{0} - {1}) ready to re-snapshot at {2} after {3} > {4} bytes, "
"waiting for "
@ -2583,6 +2666,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// reset metadata
metadata->bytesInNewDeltaFiles = 0;
metadata->resetReadStats();
// If we have more than one snapshot file and that file is unblocked (committedVersion >=
// snapshotVersion), wait for it to finish
@ -3740,6 +3824,11 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
}
}
}
// don't update read stats on a summarize read
if (metadata->updateReadStats(req.readVersion, chunk)) {
bwData->triggerReadDrivenCompaction();
}
}
rep.chunks.push_back(rep.arena, chunk);
@ -4554,6 +4643,74 @@ ACTOR Future<Void> runGRVChecks(Reference<BlobWorkerData> bwData) {
}
}
// Candidate granule for read-driven compaction, ranked by its RDC weight.
struct RDCEntry {
	double weight;
	Reference<GranuleMetadata> granule;
	RDCEntry(double weight, Reference<GranuleMetadata> granule) : weight(weight), granule(granule) {}
};

// for a top-k algorithm, we actually want a min-heap, so reverse the sort order
struct OrderForTopK {
	// Strict weak ordering that ranks higher-weight entries as "less", so
	// std::priority_queue keeps the smallest weight at top() (a min-heap).
	// Previously this returned the double difference `b.weight - a.weight`
	// converted to bool, which is true whenever the weights differ at all —
	// not a valid strict weak ordering for std::priority_queue's Compare.
	bool operator()(RDCEntry const& a, RDCEntry const& b) const { return a.weight > b.weight; }
};

typedef std::priority_queue<RDCEntry, std::vector<RDCEntry>, OrderForTopK> TopKPQ;
// Background actor: whenever a read flags a granule as a read-driven
// compaction (RDC) candidate, scan all granules, keep the top
// BLOB_WORKER_RDC_PARALLELISM candidates by weight in a min-heap, and kick
// off their re-snapshots; keep looping until no candidates remain.
ACTOR Future<Void> runReadDrivenCompaction(Reference<BlobWorkerData> bwData) {
	state bool processedAll = true;
	loop {
		if (processedAll) {
			// idle until some read marks a granule as a candidate
			wait(bwData->doReadDrivenCompaction.getFuture());
			bwData->doReadDrivenCompaction.reset();
			wait(delay(0));
		}

		TopKPQ topK;

		// FIXME: possible to scan candidates instead of all granules?
		int candidates = 0;
		auto allRanges = bwData->granuleMetadata.intersectingRanges(normalKeys);
		for (auto& it : allRanges) {
			if (it.value().activeMetadata.isValid() && it.value().activeMetadata->cancelled.canBeSet()) {
				auto metadata = it.value().activeMetadata;
				// only consider granules that are flagged, still eligible, not already
				// running an RDC, and durable through their previous snapshot
				if (metadata->rdcCandidate && metadata->isEligibleRDC() && metadata->runRDC.canBeSet() &&
				    metadata->pendingSnapshotVersion == metadata->durableSnapshotVersion.get()) {
					candidates++;
					double weight = metadata->weightRDC();
					if (weight > 1.0 &&
					    (topK.size() < SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM || weight > topK.top().weight)) {
						if (topK.size() == SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM) {
							// min-heap: evict the smallest to keep only the top k weights
							topK.pop();
						}
						topK.push(RDCEntry(weight, metadata));
					}
				}
			}
		}

		CODE_PROBE(candidates > topK.size(), "Too many read-driven compaction candidates for one cycle");

		std::vector<Future<Void>> futures;
		futures.reserve(topK.size());
		while (!topK.empty()) {
			++bwData->stats.readDrivenCompactions;
			Promise<Void> runRDC = topK.top().granule->runRDC;
			ASSERT(runRDC.canBeSet());
			// compaction is done when the granule's durable snapshot version
			// advances, or the granule is cancelled
			Future<Void> waitForSnapshotComplete = topK.top().granule->durableSnapshotVersion.whenAtLeast(
			                                          topK.top().granule->durableSnapshotVersion.get() + 1) ||
			                                      topK.top().granule->cancelled.getFuture();
			futures.push_back(waitForSnapshotComplete);
			topK.pop();
			runRDC.send(Void());
		}
		processedAll = futures.empty();
		if (!futures.empty()) {
			// wait at least one second to throttle this actor a bit
			wait(waitForAll(futures) && delay(1.0));
		}
	}
}
// FIXME: better way to do this?
// monitor system keyspace for new tenants
ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
@ -4891,6 +5048,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
self->addActor.send(waitFailureServer(bwInterf.waitFailure.getFuture()));
self->addActor.send(runGRVChecks(self));
self->addActor.send(monitorTenants(self));
self->addActor.send(runReadDrivenCompaction(self));
state Future<Void> selfRemoved = monitorRemoval(self);
if (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.25)) {
self->addActor.send(simForceFileWriteContention(self));
@ -5024,13 +5182,22 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
ASSERT(false);
throw internal_error();
}
when(wait(selfRemoved || self->simInjectFailure.getFuture())) {
when(wait(selfRemoved)) {
if (BW_DEBUG) {
printf("Blob worker detected removal. Exiting...\n");
}
TraceEvent("BlobWorkerRemoved", self->id);
break;
}
when(wait(self->simInjectFailure.getFuture())) {
// wait to let triggering actor finish to prevent weird shutdown races
wait(delay(0));
if (BW_DEBUG) {
printf("Blob worker simulation injected failure. Exiting...\n");
}
TraceEvent("BlobWorkerSimRemoved", self->id);
break;
}
when(wait(self->fatalError.getFuture())) {
TraceEvent(SevError, "BlobWorkerActorCollectionFatalErrorNotError", self->id);
ASSERT(false);

View File

@ -1060,8 +1060,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
.detail("GrvProxies", req.grvProxies.size())
.detail("RecoveryCount", req.recoveryCount)
.detail("Stalled", req.recoveryStalled)
.detail("OldestBackupEpoch", req.logSystemConfig.oldestBackupEpoch)
.detail("ClusterId", req.clusterId);
.detail("OldestBackupEpoch", req.logSystemConfig.oldestBackupEpoch);
// make sure the request comes from an active database
auto db = &self->db;
@ -1120,8 +1119,9 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
// Construct the client information
if (db->clientInfo->get().commitProxies != req.commitProxies ||
db->clientInfo->get().grvProxies != req.grvProxies ||
db->clientInfo->get().tenantMode != db->config.tenantMode || db->clientInfo->get().clusterId != req.clusterId ||
db->clientInfo->get().tenantMode != db->config.tenantMode ||
db->clientInfo->get().isEncryptionEnabled != SERVER_KNOBS->ENABLE_ENCRYPTION ||
db->clientInfo->get().clusterId != db->serverInfo->get().client.clusterId ||
db->clientInfo->get().clusterType != db->clusterType ||
db->clientInfo->get().metaclusterName != db->metaclusterName ||
db->clientInfo->get().encryptKeyProxy != db->serverInfo->get().encryptKeyProxy) {
@ -1133,9 +1133,9 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
.detail("ReqCPs", req.commitProxies)
.detail("TenantMode", db->clientInfo->get().tenantMode.toString())
.detail("ReqTenantMode", db->config.tenantMode.toString())
.detail("ClusterId", db->clientInfo->get().clusterId)
.detail("ReqClusterId", req.clusterId)
.detail("EncryptionEnabled", SERVER_KNOBS->ENABLE_ENCRYPTION)
.detail("ClusterId", db->serverInfo->get().client.clusterId)
.detail("ClientClusterId", db->clientInfo->get().clusterId)
.detail("ClusterType", db->clientInfo->get().clusterType)
.detail("ReqClusterType", db->clusterType)
.detail("MetaclusterName", db->clientInfo->get().metaclusterName)
@ -1149,7 +1149,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
clientInfo.commitProxies = req.commitProxies;
clientInfo.grvProxies = req.grvProxies;
clientInfo.tenantMode = TenantAPI::tenantModeForClusterType(db->clusterType, db->config.tenantMode);
clientInfo.clusterId = req.clusterId;
clientInfo.clusterId = db->serverInfo->get().client.clusterId;
clientInfo.clusterType = db->clusterType;
clientInfo.metaclusterName = db->metaclusterName;
db->clientInfo->set(clientInfo);
@ -1228,6 +1228,17 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
std::vector<NetworkAddress> coordinatorAddresses = wait(cs.tryResolveHostnames());
const WorkerInterface& w = req.wi;
if (req.clusterId.present() && self->clusterId->get().present() && req.clusterId != self->clusterId->get() &&
req.processClass != ProcessClass::TesterClass) {
TraceEvent(g_network->isSimulated() ? SevWarnAlways : SevError, "WorkerBelongsToExistingCluster", self->id)
.detail("WorkerClusterId", req.clusterId)
.detail("ClusterControllerClusterId", self->clusterId->get())
.detail("WorkerId", w.id())
.detail("ProcessId", w.locality.processId());
req.reply.sendError(invalid_cluster_id());
return Void();
}
ProcessClass newProcessClass = req.processClass;
auto info = self->id_worker.find(w.locality.processId());
ClusterControllerPriorityInfo newPriorityInfo = req.priorityInfo;
@ -2964,13 +2975,64 @@ ACTOR Future<Void> metaclusterMetricsUpdater(ClusterControllerData* self) {
}
}
// Update the DBInfo state with this process's cluster ID. If this process does
// not have a cluster ID and one does not exist in the database, generate one.
ACTOR Future<Void> updateClusterId(ClusterControllerData* self) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
	loop {
		try {
			// Cluster ID (if any) recovered from this process's durable state.
			state Optional<UID> durableClusterId = self->clusterId->get();
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
			Optional<Value> clusterIdVal = wait(tr->get(clusterIdKey));
			if (clusterIdVal.present()) {
				UID clusterId = BinaryReader::fromStringRef<UID>(clusterIdVal.get(), IncludeVersion());
				if (durableClusterId.present()) {
					// If this process has an on disk file for the cluster ID,
					// verify it matches the value in the database.
					ASSERT(clusterId == durableClusterId.get());
				} else {
					// Otherwise, write the cluster ID in the database to the
					// DbInfo object so all clients will learn of the cluster
					// ID.
					durableClusterId = clusterId;
				}
			} else if (!durableClusterId.present()) {
				// No cluster ID exists in the database or on the machine. Generate and set one.
				ASSERT(!durableClusterId.present());
				durableClusterId = deterministicRandom()->randomUniqueID();
				tr->set(clusterIdKey, BinaryWriter::toValue(durableClusterId.get(), IncludeVersion()));
				wait(tr->commit());
			}
			// Publish the cluster ID through ServerDBInfo and ClientDBInfo.
			// NOTE(review): the fresh random `id` fields presumably ensure the
			// AsyncVars register a change and notify listeners — confirm.
			auto serverInfo = self->db.serverInfo->get();
			if (!serverInfo.client.clusterId.isValid()) {
				ASSERT(durableClusterId.present());
				serverInfo.id = deterministicRandom()->randomUniqueID();
				serverInfo.client.clusterId = durableClusterId.get();
				self->db.serverInfo->set(serverInfo);

				ClientDBInfo clientInfo = self->db.clientInfo->get();
				clientInfo.id = deterministicRandom()->randomUniqueID();
				clientInfo.clusterId = durableClusterId.get();
				self->db.clientInfo->set(clientInfo);
			}
			return Void();
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}
ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
Future<Void> leaderFail,
ServerCoordinators coordinators,
LocalityData locality,
ConfigDBType configDBType,
Future<Void> recoveredDiskFiles) {
state ClusterControllerData self(interf, locality, coordinators);
Future<Void> recoveredDiskFiles,
Reference<AsyncVar<Optional<UID>>> clusterId) {
state ClusterControllerData self(interf, locality, coordinators, clusterId);
state Future<Void> coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY);
state uint64_t step = 0;
state Future<ErrorOr<Void>> error = errorOr(actorCollection(self.addActor.getFuture()));
@ -3007,11 +3069,11 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(monitorConsistencyScan(&self));
self.addActor.send(metaclusterMetricsUpdater(&self));
self.addActor.send(dbInfoUpdater(&self));
self.addActor.send(traceCounters("ClusterControllerMetrics",
self.id,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&self.clusterControllerMetrics,
self.id.toString() + "/ClusterControllerMetrics"));
self.addActor.send(updateClusterId(&self));
self.addActor.send(self.clusterControllerMetrics.traceCounters("ClusterControllerMetrics",
self.id,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
self.id.toString() + "/ClusterControllerMetrics"));
self.addActor.send(traceRole(Role::CLUSTER_CONTROLLER, interf.id()));
// printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
@ -3124,7 +3186,8 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
LocalityData locality,
ConfigDBType configDBType,
Future<Void> recoveredDiskFiles) {
Future<Void> recoveredDiskFiles,
Reference<AsyncVar<Optional<UID>>> clusterId) {
loop {
state ClusterControllerFullInterface cci;
state bool inRole = false;
@ -3151,7 +3214,8 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID());
inRole = true;
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType, recoveredDiskFiles));
wait(clusterControllerCore(
cci, leaderFail, coordinators, locality, configDBType, recoveredDiskFiles, clusterId));
}
} catch (Error& e) {
if (inRole)
@ -3175,7 +3239,8 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
Future<Void> recoveredDiskFiles,
LocalityData locality,
ConfigDBType configDBType) {
ConfigDBType configDBType,
Reference<AsyncVar<Optional<UID>>> clusterId) {
// Defer this wait optimization of cluster configuration has 'Encryption data at-rest' enabled.
// Encryption depends on available of EncryptKeyProxy (EKP) FDB role to enable fetch/refresh of encryption keys
@ -3195,8 +3260,14 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
loop {
try {
ServerCoordinators coordinators(connRecord, configDBType);
wait(clusterController(
coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType, recoveredDiskFiles));
wait(clusterController(coordinators,
currentCC,
hasConnected,
asyncPriorityInfo,
locality,
configDBType,
recoveredDiskFiles,
clusterId));
hasConnected = true;
} catch (Error& e) {
if (e.code() != error_code_coordinators_changed)
@ -3214,7 +3285,8 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
state ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
state NetworkAddress workerAddress(IPAddress(0x01010101), 1);
state NetworkAddress badPeer1(IPAddress(0x02020202), 1);
state NetworkAddress badPeer2(IPAddress(0x03030303), 1);
@ -3309,7 +3381,8 @@ TEST_CASE("/fdbserver/clustercontroller/updateRecoveredWorkers") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress worker1(IPAddress(0x01010101), 1);
NetworkAddress worker2(IPAddress(0x11111111), 1);
NetworkAddress badPeer1(IPAddress(0x02020202), 1);
@ -3358,7 +3431,8 @@ TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress worker(IPAddress(0x01010101), 1);
NetworkAddress badPeer1(IPAddress(0x02020202), 1);
NetworkAddress badPeer2(IPAddress(0x03030303), 1);
@ -3511,7 +3585,8 @@ TEST_CASE("/fdbserver/clustercontroller/recentRecoveryCountDueToHealth") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
ASSERT_EQ(data.recentRecoveryCountDueToHealth(), 0);
@ -3532,7 +3607,8 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerRecoveryDueToDegradedServer
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress master(IPAddress(0x01010101), 1);
NetworkAddress tlog(IPAddress(0x02020202), 1);
NetworkAddress satelliteTlog(IPAddress(0x03030303), 1);
@ -3668,7 +3744,8 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress master(IPAddress(0x01010101), 1);
NetworkAddress tlog(IPAddress(0x02020202), 1);
NetworkAddress satelliteTlog(IPAddress(0x03030303), 1);

View File

@ -297,7 +297,6 @@ ACTOR Future<Void> newTLogServers(Reference<ClusterRecoveryData> self,
self->logSystem = Reference<ILogSystem>(); // Cancels the actors in the previous log system.
Reference<ILogSystem> newLogSystem = wait(oldLogSystem->newEpoch(recr,
fRemoteWorkers,
self->clusterId,
self->configuration,
self->cstate.myDBState.recoveryCount + 1,
self->recoveryTransactionVersion,
@ -311,7 +310,6 @@ ACTOR Future<Void> newTLogServers(Reference<ClusterRecoveryData> self,
self->logSystem = Reference<ILogSystem>(); // Cancels the actors in the previous log system.
Reference<ILogSystem> newLogSystem = wait(oldLogSystem->newEpoch(recr,
Never(),
self->clusterId,
self->configuration,
self->cstate.myDBState.recoveryCount + 1,
self->recoveryTransactionVersion,
@ -347,7 +345,6 @@ ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
isr.storeType = self->configuration.storageServerStoreType;
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = deterministicRandom()->randomUniqueID();
isr.clusterId = self->clusterId;
isr.initialClusterVersion = self->recoveryTransactionVersion;
ErrorOr<InitializeStorageReply> newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr));
@ -477,7 +474,6 @@ ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self,
self->dbgid)
.detail("StatusCode", RecoveryStatus::fully_recovered)
.detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered])
.detail("ClusterId", self->clusterId)
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_GENERATION_EVENT_NAME).c_str(),
@ -786,7 +782,6 @@ Future<Void> sendMasterRegistration(ClusterRecoveryData* self,
masterReq.priorCommittedLogServers = priorCommittedLogServers;
masterReq.recoveryState = self->recoveryState;
masterReq.recoveryStalled = self->recruitmentStalled->get();
masterReq.clusterId = self->clusterId;
return brokenPromiseToNever(self->clusterController.registerMaster.getReply(masterReq));
}
@ -1350,8 +1345,7 @@ ACTOR Future<Void> recoverFrom(Reference<ClusterRecoveryData> self,
Reference<ILogSystem> oldLogSystem,
std::vector<StorageServerInterface>* seedServers,
std::vector<Standalone<CommitTransactionRef>>* initialConfChanges,
Future<Version> poppedTxsVersion,
bool* clusterIdExists) {
Future<Version> poppedTxsVersion) {
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
.detail("StatusCode", RecoveryStatus::reading_transaction_system_state)
.detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state])
@ -1375,16 +1369,6 @@ ACTOR Future<Void> recoverFrom(Reference<ClusterRecoveryData> self,
debug_checkMaxRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery");
// Generate a cluster ID to uniquely identify the cluster if it doesn't
// already exist in the txnStateStore.
Optional<Value> clusterId = self->txnStateStore->readValue(clusterIdKey).get();
*clusterIdExists = clusterId.present();
if (!clusterId.present()) {
self->clusterId = deterministicRandom()->randomUniqueID();
} else {
self->clusterId = BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
}
// Ordinarily we pass through this loop once and recover. We go around the loop if recovery stalls for more than a
// second, a provisional master is initialized, and an "emergency transaction" is submitted that might change the
// configuration so that we can finish recovery.
@ -1540,7 +1524,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
state Future<Void> logChanges;
state Future<Void> minRecoveryDuration;
state Future<Version> poppedTxsVersion;
state bool clusterIdExists = false;
loop {
Reference<ILogSystem> oldLogSystem = oldLogSystems->get();
@ -1556,13 +1539,9 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
self->registrationTrigger.trigger();
choose {
when(wait(oldLogSystem ? recoverFrom(self,
oldLogSystem,
&seedServers,
&initialConfChanges,
poppedTxsVersion,
std::addressof(clusterIdExists))
: Never())) {
when(wait(oldLogSystem
? recoverFrom(self, oldLogSystem, &seedServers, &initialConfChanges, poppedTxsVersion)
: Never())) {
reg.cancel();
break;
}
@ -1591,7 +1570,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
.detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction])
.detail("PrimaryLocality", self->primaryLocality)
.detail("DcId", self->masterInterface.locality.dcId())
.detail("ClusterId", self->clusterId)
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
// Recovery transaction
@ -1680,11 +1658,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
}
}
// Write cluster ID into txnStateStore if it is missing.
if (!clusterIdExists) {
tr.set(recoveryCommitRequest.arena, clusterIdKey, BinaryWriter::toValue(self->clusterId, Unversioned()));
}
applyMetadataMutations(SpanContext(),
self->dbgid,
recoveryCommitRequest.arena,

View File

@ -28,7 +28,7 @@
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/NativeAPI.actor.h"
@ -1616,6 +1616,14 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
self->toCommit.writeTypedMessage(idempotencyIdSet);
});
for (const auto& m : pProxyCommitData->idempotencyClears) {
auto& tags = pProxyCommitData->tagsForKey(m.param1);
self->toCommit.addTags(tags);
// TODO(nwijetunga): Encrypt these mutations
self->toCommit.writeTypedMessage(m);
}
pProxyCommitData->idempotencyClears = Standalone<VectorRef<MutationRef>>();
self->toCommit.saveTags(self->writtenTags);
pProxyCommitData->stats.mutations += self->mutationCount;
@ -1864,10 +1872,14 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
// Reset all to zero, used to track the correct index of each commitTransacitonRef on each resolver
std::fill(self->nextTr.begin(), self->nextTr.end(), 0);
std::unordered_map<uint8_t, int16_t> idCountsForKey;
for (int t = 0; t < self->trs.size(); t++) {
auto& tr = self->trs[t];
if (self->committed[t] == ConflictBatch::TransactionCommitted && (!self->locked || tr.isLockAware())) {
ASSERT_WE_THINK(self->commitVersion != invalidVersion);
if (self->trs[t].idempotencyId.valid()) {
idCountsForKey[uint8_t(t >> 8)] += 1;
}
tr.reply.send(CommitID(self->commitVersion, t, self->metadataVersionAfter));
} else if (self->committed[t] == ConflictBatch::TransactionTooOld) {
tr.reply.sendError(transaction_too_old());
@ -1914,6 +1926,11 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
}
}
for (auto [highOrderBatchIndex, count] : idCountsForKey) {
pProxyCommitData->expectedIdempotencyIdCountForKey.send(
ExpectedIdempotencyIdCountForKey{ self->commitVersion, count, highOrderBatchIndex });
}
++pProxyCommitData->stats.commitBatchOut;
pProxyCommitData->stats.txnCommitOut += self->trs.size();
pProxyCommitData->stats.txnConflicts += self->trs.size() - self->commitCount;
@ -2469,6 +2486,96 @@ ACTOR Future<Void> reportTxnTagCommitCost(UID myID,
}
}
namespace {

// Bookkeeping for one (commitVersion, batchIndexHighByte) group of idempotency
// ids: counts expire requests received versus the number expected before the
// group's ids may be cleared (see idempotencyIdsExpireServer below).
struct ExpireServerEntry {
	// Time (from now()) this entry was first initialized; used to purge stale
	// entries that never complete. Initialized to 0 — previously this was the
	// only member without an in-class initializer, so map-created entries
	// carried an indeterminate value until it was assigned.
	int64_t timeReceived = 0;
	// Number of expire requests expected for this group; 0 until reported.
	int expectedCount = 0;
	// Number of expire requests received so far.
	int receivedCount = 0;
	// Whether timeReceived has been stamped for this entry.
	bool initialized = false;
};

// Identifies a batch of idempotency ids by commit version plus the high-order
// byte of the batch index.
struct IdempotencyKey {
	Version version;
	uint8_t highOrderBatchIndex;
	bool operator==(const IdempotencyKey& other) const {
		return version == other.version && highOrderBatchIndex == other.highOrderBatchIndex;
	}
};

} // namespace
namespace std {
// Hash specialization so IdempotencyKey can be used as an unordered_map key;
// folds the member hashes together with boost::hash_combine.
template <>
struct hash<IdempotencyKey> {
	std::size_t operator()(const IdempotencyKey& key) const {
		std::size_t seed = 0;
		boost::hash_combine(seed, std::hash<Version>{}(key.version));
		boost::hash_combine(seed, std::hash<uint8_t>{}(key.highOrderBatchIndex));
		return seed;
	}
};
} // namespace std
// Coordinates cleanup of idempotency ids. Two streams feed one table keyed by
// (commitVersion, batchIndexHighByte):
//   - expireIdempotencyId: one expire request for the group has been processed;
//   - expectedIdempotencyIdCountForKey: the commit path reports how many expire
//     requests to expect for the group.
// When receivedCount reaches expectedCount, a ClearRange mutation covering the
// group's idempotency key range is appended to *idempotencyClears. Entries
// older than IDEMPOTENCY_ID_IN_MEMORY_LIFETIME are periodically purged.
ACTOR static Future<Void> idempotencyIdsExpireServer(
    Database db,
    PublicRequestStream<ExpireIdempotencyIdRequest> expireIdempotencyId,
    PromiseStream<ExpectedIdempotencyIdCountForKey> expectedIdempotencyIdCountForKey,
    Standalone<VectorRef<MutationRef>>* idempotencyClears) {
	state std::unordered_map<IdempotencyKey, ExpireServerEntry> idStatus;
	state std::unordered_map<IdempotencyKey, ExpireServerEntry>::iterator iter;
	state int64_t purgeBefore;
	state IdempotencyKey key;
	// Points at the entry touched by the branch taken below; valid only for the
	// request branches (the purge branch `continue`s past its use).
	state ExpireServerEntry* status = nullptr;
	state Future<Void> purgeOld = Void();
	loop {
		choose {
			when(ExpireIdempotencyIdRequest req = waitNext(expireIdempotencyId.getFuture())) {
				key = IdempotencyKey{ req.commitVersion, req.batchIndexHighByte };
				status = &idStatus[key];
				status->receivedCount += 1;
				CODE_PROBE(status->expectedCount == 0, "ExpireIdempotencyIdRequest received before count is known");
				if (status->expectedCount > 0) {
					ASSERT_LE(status->receivedCount, status->expectedCount);
				}
			}
			when(ExpectedIdempotencyIdCountForKey req = waitNext(expectedIdempotencyIdCountForKey.getFuture())) {
				key = IdempotencyKey{ req.commitVersion, req.batchIndexHighByte };
				status = &idStatus[key];
				// The expected count for a group is reported exactly once.
				ASSERT_EQ(status->expectedCount, 0);
				status->expectedCount = req.idempotencyIdCount;
			}
			when(wait(purgeOld)) {
				purgeOld = delay(SERVER_KNOBS->IDEMPOTENCY_ID_IN_MEMORY_LIFETIME);
				purgeBefore = now() - SERVER_KNOBS->IDEMPOTENCY_ID_IN_MEMORY_LIFETIME;
				for (iter = idStatus.begin(); iter != idStatus.end();) {
					// We have exclusive access to idStatus in this when block, so iter will still be valid after the
					// wait
					wait(yield());
					if (iter->second.timeReceived < purgeBefore) {
						iter = idStatus.erase(iter);
					} else {
						++iter;
					}
				}
				// Skip the completion check below: `status` may reference an
				// entry this purge just erased.
				continue;
			}
		}
		if (status->initialized) {
			if (status->receivedCount == status->expectedCount) {
				// All expected expire requests arrived: queue a clear of the
				// group's idempotency-id key range and forget the entry.
				auto keyRange =
				    makeIdempotencySingleKeyRange(idempotencyClears->arena(), key.version, key.highOrderBatchIndex);
				idempotencyClears->push_back(idempotencyClears->arena(),
				                             MutationRef(MutationRef::ClearRange, keyRange.begin, keyRange.end));
				idStatus.erase(key);
			}
		} else {
			// First time this group is seen: stamp it for the purge scan.
			status->timeReceived = now();
			status->initialized = true;
		}
	}
}
namespace {
struct TransactionStateResolveContext {
@ -2733,6 +2840,10 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
addActor.send(rejoinServer(proxy, &commitData));
addActor.send(ddMetricsRequestServer(proxy, db));
addActor.send(reportTxnTagCommitCost(proxy.id(), db, &commitData.ssTrTagCommitCost));
addActor.send(idempotencyIdsExpireServer(openDBOnServer(db),
proxy.expireIdempotencyId,
commitData.expectedIdempotencyIdCountForKey,
&commitData.idempotencyClears));
// wait for txnStateStore recovery
wait(success(commitData.txnStateStore->readValue(StringRef())));

View File

@ -183,8 +183,8 @@ class ConfigBroadcasterImpl {
id(deterministicRandom()->randomUniqueID()), cc("ConfigBroadcaster"), compactRequest("CompactRequest", cc),
successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc),
snapshotRequest("SnapshotRequest", cc) {
logger = traceCounters(
"ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigBroadcasterMetrics");
logger = cc.traceCounters(
"ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigBroadcasterMetrics");
}
void addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,

View File

@ -812,7 +812,7 @@ public:
successfulCommits("SuccessfulCommits", cc), failedCommits("FailedCommits", cc),
setMutations("SetMutations", cc), clearMutations("ClearMutations", cc),
getValueRequests("GetValueRequests", cc), getGenerationRequests("GetGenerationRequests", cc) {
logger = traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigNode");
logger = cc.traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigNode");
TraceEvent(SevInfo, "StartingConfigNode", id).detail("KVStoreAlreadyExists", kvStore.exists());
}

View File

@ -29,7 +29,7 @@
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/ServerDBInfo.h"
@ -393,6 +393,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
state double rateLimiterStartTime = now();
state int64_t bytesReadInthisRound = 0;
state bool resume = !(restart || shuffleShards);
state bool testResult = true;
state double dbSize = 100e12;
if (g_network->isSimulated()) {
@ -710,7 +711,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
(!storageServerInterfaces[j].isTss() &&
!storageServerInterfaces[firstValidServer].isTss())) {
testFailure("Data inconsistent", performQuiescentChecks, true);
return false;
testResult = false;
}
}
}
@ -949,7 +950,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
}
*bytesReadInPrevRound = bytesReadInthisRound;
return true;
return testResult;
}
ACTOR Future<Void> runDataValidationCheck(ConsistencyScanData* self) {

View File

@ -212,7 +212,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys,
const Reference<AsyncVar<Optional<ShardMetrics>>>& shardMetrics,
const BandwidthStatus& bandwidthStatus,
PromiseStream<KeyRange> readHotShard) {
ShardSizeBounds bounds;
ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
if (shardMetrics->get().present()) {
auto bytes = shardMetrics->get().get().metrics.bytes;
auto readBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get().metrics);
@ -259,21 +259,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys,
} else {
ASSERT(false);
}
} else {
bounds.max.bytes = -1;
bounds.min.bytes = -1;
bounds.permittedError.bytes = -1;
bounds.max.bytesPerKSecond = bounds.max.infinity;
bounds.min.bytesPerKSecond = 0;
bounds.permittedError.bytesPerKSecond = bounds.permittedError.infinity;
bounds.max.bytesReadPerKSecond = bounds.max.infinity;
bounds.min.bytesReadPerKSecond = 0;
bounds.permittedError.bytesReadPerKSecond = bounds.permittedError.infinity;
}
bounds.max.iosPerKSecond = bounds.max.infinity;
bounds.min.iosPerKSecond = 0;
bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity;
return bounds;
}

View File

@ -895,7 +895,7 @@ public:
if (maxPriority < SERVER_KNOBS->PRIORITY_TEAM_FAILED) {
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>,
std::vector<ShardsAffectedByTeamFailure::Team>>
teams = self->shardsAffectedByTeamFailure->getTeamsFor(shards[i]);
teams = self->shardsAffectedByTeamFailure->getTeamsForFirstShard(shards[i]);
for (int j = 0; j < teams.first.size() + teams.second.size(); j++) {
// t is the team in primary DC or the remote DC
auto& t =
@ -2284,15 +2284,12 @@ public:
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.stableAddress());
UID clusterId = wait(self->getClusterId());
state InitializeStorageRequest isr;
isr.storeType = recruitTss ? self->configuration.testingStorageServerStoreType
: self->configuration.storageServerStoreType;
isr.seedTag = invalidTag;
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = interfaceId;
isr.clusterId = clusterId;
// if tss, wait for pair ss to finish and add its id to isr. If pair fails, don't recruit tss
state bool doRecruit = true;
@ -3470,10 +3467,6 @@ Future<Void> DDTeamCollection::monitorHealthyTeams() {
return DDTeamCollectionImpl::monitorHealthyTeams(this);
}
Future<UID> DDTeamCollection::getClusterId() {
return db->getClusterId();
}
Future<UID> DDTeamCollection::getNextWigglingServerID() {
Optional<Value> localityKey;
Optional<Value> localityValue;

View File

@ -221,21 +221,6 @@ class DDTxnProcessorImpl {
}
}
// Reads the cluster ID from the system keyspace (clusterIdKey), retrying the
// transaction on retryable errors. The key is expected to exist; asserts if it
// does not.
ACTOR static Future<UID> getClusterId(Database cx) {
	state Transaction tr(cx);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			Optional<Value> clusterId = wait(tr.get(clusterIdKey));
			ASSERT(clusterId.present());
			return BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}
// Read keyservers, return unique set of teams
ACTOR static Future<Reference<InitialDataDistribution>> getInitialDataDistribution(
Database cx,
@ -319,6 +304,7 @@ class DDTxnProcessorImpl {
for (int i = 0; i < dms.size(); ++i) {
auto dataMove = std::make_shared<DataMove>(decodeDataMoveValue(dms[i].value), true);
const DataMoveMetaData& meta = dataMove->meta;
ASSERT(!meta.ranges.empty());
for (const UID& id : meta.src) {
auto& dc = server_dc[id];
if (std::find(remoteDcIds.begin(), remoteDcIds.end(), dc) != remoteDcIds.end()) {
@ -340,11 +326,11 @@ class DDTxnProcessorImpl {
std::sort(dataMove->primaryDest.begin(), dataMove->primaryDest.end());
std::sort(dataMove->remoteDest.begin(), dataMove->remoteDest.end());
auto ranges = result->dataMoveMap.intersectingRanges(meta.range);
auto ranges = result->dataMoveMap.intersectingRanges(meta.ranges.front());
for (auto& r : ranges) {
ASSERT(!r.value()->valid);
}
result->dataMoveMap.insert(meta.range, std::move(dataMove));
result->dataMoveMap.insert(meta.ranges.front(), std::move(dataMove));
++numDataMoves;
}
@ -675,10 +661,6 @@ Future<int> DDTxnProcessor::tryUpdateReplicasKeyForDc(const Optional<Key>& dcId,
return DDTxnProcessorImpl::tryUpdateReplicasKeyForDc(cx, dcId, storageTeamSize);
}
Future<UID> DDTxnProcessor::getClusterId() const {
return DDTxnProcessorImpl::getClusterId(cx);
}
Future<Void> DDTxnProcessor::waitDDTeamInfoPrintSignal() const {
return DDTxnProcessorImpl::waitDDTeamInfoPrintSignal(cx);
}
@ -763,7 +745,7 @@ std::vector<DDShardInfo> DDMockTxnProcessor::getDDShardInfos() const {
KeyRangeRef curRange = it->range();
DDShardInfo info(curRange.begin);
auto teams = mgs->shardMapping->getTeamsFor(curRange);
auto teams = mgs->shardMapping->getTeamsForFirstShard(curRange);
if (!teams.first.empty() && !teams.second.empty()) {
CODE_PROBE(true, "Mock InitialDataDistribution In-Flight shard");
info.hasDest = true;
@ -816,7 +798,7 @@ Future<Void> DDMockTxnProcessor::removeStorageServer(const UID& serverID,
const Optional<UID>& tssPairID,
const MoveKeysLock& lock,
const DDEnabledState* ddEnabledState) const {
ASSERT(mgs->allShardRemovedFromServer(serverID));
ASSERT(mgs->allShardsRemovedFromServer(serverID));
mgs->allServers.erase(serverID);
return Void();
}
@ -862,16 +844,14 @@ Future<HealthMetrics> DDMockTxnProcessor::getHealthMetrics(bool detailed) const
return Future<HealthMetrics>();
}
// FIXME: finish implementation
Future<Standalone<VectorRef<KeyRef>>> DDMockTxnProcessor::splitStorageMetrics(
const KeyRange& keys,
const StorageMetrics& limit,
const StorageMetrics& estimated,
const Optional<int>& minSplitBytes) const {
return Future<Standalone<VectorRef<KeyRef>>>();
return mgs->splitStorageMetrics(keys, limit, estimated, minSplitBytes);
}
// FIXME: finish implementation
Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorageMetrics(
const KeyRange& keys,
const StorageMetrics& min,
@ -879,7 +859,7 @@ Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorage
const StorageMetrics& permittedError,
int shardLimit,
int expectedShardCount) const {
return Future<std::pair<Optional<StorageMetrics>, int>>();
return mgs->waitStorageMetrics(keys, min, max, permittedError, shardLimit, expectedShardCount);
}
// FIXME: finish implementation
@ -910,7 +890,7 @@ void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params,
ASSERT(params.finishMoveKeysParallelismLock->take().isReady());
// get source and dest teams
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsFor(params.keys);
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys);
ASSERT_EQ(destTeams.size(), 0);
if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) {

View File

@ -53,6 +53,20 @@
#include "fdbserver/DDSharedContext.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// Sentinel bounds applied to a shard before any metrics have been tracked for
// it: byte bounds are -1 (size unknown), while the rate bounds are fully open
// (min 0, max/permittedError infinity) so no threshold fires until real
// measurements arrive.
ShardSizeBounds ShardSizeBounds::shardSizeBoundsBeforeTrack() {
	StorageMetrics maxMetrics;
	maxMetrics.bytes = -1;
	maxMetrics.bytesPerKSecond = StorageMetrics::infinity;
	maxMetrics.iosPerKSecond = StorageMetrics::infinity;
	maxMetrics.bytesReadPerKSecond = StorageMetrics::infinity;

	StorageMetrics minMetrics;
	minMetrics.bytes = -1;
	minMetrics.bytesPerKSecond = 0;
	minMetrics.iosPerKSecond = 0;
	minMetrics.bytesReadPerKSecond = 0;

	StorageMetrics errorMetrics;
	errorMetrics.bytes = -1;
	errorMetrics.bytesPerKSecond = StorageMetrics::infinity;
	errorMetrics.iosPerKSecond = StorageMetrics::infinity;
	errorMetrics.bytesReadPerKSecond = StorageMetrics::infinity;

	ShardSizeBounds bounds;
	bounds.max = maxMetrics;
	bounds.min = minMetrics;
	bounds.permittedError = errorMetrics;
	return bounds;
}
struct DDAudit {
DDAudit(UID id, KeyRange range, AuditType type)
: id(id), range(range), type(type), auditMap(AuditPhase::Invalid, allKeys.end), actors(true) {}
@ -76,7 +90,7 @@ void DataMove::validateShard(const DDShardInfo& shard, KeyRangeRef range, int pr
return;
}
ASSERT(this->meta.range.contains(range));
ASSERT(!this->meta.ranges.empty() && this->meta.ranges.front().contains(range));
if (!shard.hasDest) {
TraceEvent(SevError, "DataMoveValidationError")
@ -480,17 +494,21 @@ public:
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
const DataMoveMetaData& meta = it.value()->meta;
if (meta.ranges.empty()) {
TraceEvent(SevWarnAlways, "EmptyDataMoveRange", self->ddId).detail("DataMoveMetaData", meta.toString());
continue;
}
if (it.value()->isCancelled() || (it.value()->valid && !SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
RelocateShard rs(meta.range, DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
RelocateShard rs(meta.ranges.front(), DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.cancelled = true;
self->relocationProducer.send(rs);
TraceEvent("DDInitScheduledCancelDataMove", self->ddId).detail("DataMove", meta.toString());
} else if (it.value()->valid) {
TraceEvent(SevDebug, "DDInitFoundDataMove", self->ddId).detail("DataMove", meta.toString());
ASSERT(meta.range == it.range());
ASSERT(meta.ranges.front() == it.range());
// TODO: Persist priority in DataMoveMetaData.
RelocateShard rs(meta.range, DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
RelocateShard rs(meta.ranges.front(), DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.dataMove = it.value();
std::vector<ShardsAffectedByTeamFailure::Team> teams;

View File

@ -1662,3 +1662,43 @@ IDiskQueue* openDiskQueue(std::string basename,
int64_t fileSizeWarningLimit) {
return new DiskQueue_PopUncommitted(basename, ext, dbgid, dqv, fileSizeWarningLimit);
}
// Performance exercise for IDiskQueue: recovers (draining any leftover data),
// then runs 4000 iterations that push a 10 MB payload each time, popping and
// occasionally reading the front entry on every other iteration, with commits
// pipelined one deep (wait on the previous commit while the next is issued).
TEST_CASE("performance/fdbserver/DiskQueue") {
	state IDiskQueue* queue =
	    openDiskQueue("test-", "fdq", deterministicRandom()->randomUniqueID(), DiskQueueVersion::V2);
	// 10 MB payload reused for every push.
	state std::string valueString = std::string(10e6, '.');
	state StringRef valueStr((uint8_t*)valueString.c_str(), 10e6);
	// Locations of pushed-but-not-yet-popped entries, oldest first.
	state std::deque<IDiskQueue::location> locations;
	state int loopCount = 0;
	state Future<Void> lastCommit = Void();
	bool fullyRecovered = wait(queue->initializeRecovery(0));
	if (!fullyRecovered) {
		// Drain any data left over from a previous run, 1 MB at a time.
		loop {
			Standalone<StringRef> h = wait(queue->readNext(1e6));
			if (h.size() < 1e6) {
				break;
			}
		}
	}
	while (loopCount < 4000) {
		if (loopCount % 100 == 0) {
			printf("loop count: %d\n", loopCount);
		}
		// Pop (and sometimes read back) the oldest entry on even iterations.
		if (++loopCount % 2 == 0) {
			state IDiskQueue::location frontLocation = locations.front();
			locations.pop_front();
			if (locations.size() > 10) {
				Standalone<StringRef> r = wait(queue->read(frontLocation, locations.front(), CheckHashes::True));
			}
			queue->pop(frontLocation);
		}
		wait(delay(0.001));
		locations.push_back(queue->push(valueStr));
		// Pipeline commits: issue this iteration's commit, wait on the prior one.
		Future<Void> prevCommit = lastCommit;
		lastCommit = queue->commit();
		wait(prevCommit);
	}
	queue->dispose();
	wait(queue->onClosed());
	return Void();
}

View File

@ -625,7 +625,7 @@ bool isBlobMetadataEligibleForRefresh(const BlobMetadataDetailsRef& blobMetadata
if (BUGGIFY_WITH_PROB(0.01)) {
return true;
}
int64_t nextRefreshCycleTS = currTS + SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
int64_t nextRefreshCycleTS = currTS + CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
return nextRefreshCycleTS > blobMetadata.expireAt || nextRefreshCycleTS > blobMetadata.refreshAt;
}
@ -895,7 +895,7 @@ ACTOR Future<Void> encryptKeyProxyServer(EncryptKeyProxyInterface ekpInterface,
TaskPriority::Worker);
self->blobMetadataRefresher = recurring([&]() { refreshBlobMetadata(self, kmsConnectorInf); },
SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL,
CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL,
TaskPriority::Worker);
try {

View File

@ -170,7 +170,8 @@ ACTOR Future<int> spawnSimulated(std::vector<std::string> paramList,
ProcessClass(ProcessClass::UnsetClass, ProcessClass::AutoSource),
self->dataFolder.c_str(),
self->coordinationFolder.c_str(), // do we need to customize this coordination folder path?
self->protocolVersion);
self->protocolVersion,
false);
wait(g_simulator->onProcess(child));
state Future<ISimulator::KillType> onShutdown = child->onShutdown();
state Future<ISimulator::KillType> parentShutdown = self->onShutdown();

View File

@ -202,7 +202,8 @@ class GlobalTagThrottlerImpl {
for (const auto& [id, _] : throughput) {
result += getCurrentCost(id, tag).orDefault(0);
}
TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result);
// FIXME: Disabled due to noisy trace events. Fix the noise and reenabled
//TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result);
return result;
}
@ -235,10 +236,13 @@ class GlobalTagThrottlerImpl {
return 1.0;
}
auto const transactionRate = stats.get().getTransactionRate();
// FIXME: Disabled due to noisy trace events. Fix the noise and reenabled
/*
TraceEvent("GlobalTagThrottler_GetAverageTransactionCost")
.detail("Tag", tag)
.detail("TransactionRate", transactionRate)
.detail("Cost", cost);
*/
if (transactionRate == 0.0) {
return 1.0;
} else {

View File

@ -154,7 +154,7 @@ struct GrvProxyStats {
return int64_t(100 * this->percentageOfBatchGRVQueueProcessed);
});
logger = traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "GrvProxyMetrics");
logger = cc.traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "GrvProxyMetrics");
for (int i = 0; i < FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS; i++) {
requestBuckets.push_back(0);
}
@ -459,9 +459,9 @@ void dropRequestFromQueue(Deque<GetReadVersionRequest>* queue, GrvProxyStats* st
// Put a GetReadVersion request into the queue corresponding to its priority.
ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo> const> db,
SpannedDeque<GetReadVersionRequest>* systemQueue,
SpannedDeque<GetReadVersionRequest>* defaultQueue,
SpannedDeque<GetReadVersionRequest>* batchQueue,
Deque<GetReadVersionRequest>* systemQueue,
Deque<GetReadVersionRequest>* defaultQueue,
Deque<GetReadVersionRequest>* batchQueue,
FutureStream<GetReadVersionRequest> readVersionRequests,
PromiseStream<Void> GRVTimer,
double* lastGRVTime,
@ -531,7 +531,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
stats->txnSystemPriorityStartIn += req.transactionCount;
++stats->systemGRVQueueSize;
systemQueue->push_back(req);
// systemQueue->span.addParent(req.spanContext);
} else if (req.priority >= TransactionPriority::DEFAULT) {
++stats->txnRequestIn;
stats->txnStartIn += req.transactionCount;
@ -542,7 +541,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
} else {
defaultQueue->push_back(req);
}
// defaultQueue->span.addParent(req.spanContext);
} else {
// Return error for batch_priority GRV requests
int64_t proxiesCount = std::max((int)db->get().client.grvProxies.size(), 1);
@ -559,7 +557,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
} else {
batchQueue->push_back(req);
}
// batchQueue->span.addParent(req.spanContext);
}
}
}
@ -607,7 +604,7 @@ ACTOR Future<Void> lastCommitUpdater(GrvProxyData* self, PromiseStream<Future<Vo
}
}
ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanContext parentSpan,
ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(std::vector<SpanContext> spanContexts,
GrvProxyData* grvProxyData,
uint32_t flags,
Optional<UID> debugID,
@ -620,7 +617,10 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanContext parentSpan
// before the request returns, so it is committed. (2) No proxy on our list reported committed a higher version
// before this request was received, because then its committedVersion would have been higher,
// and no other proxy could have already committed anything without first ending the epoch
state Span span("GP:getLiveCommittedVersion"_loc, parentSpan);
state Span span("GP:getLiveCommittedVersion"_loc);
for (const SpanContext& spanContext : spanContexts) {
span.addLink(spanContext);
}
++grvProxyData->stats.txnStartBatch;
state double grvStart = now();
@ -826,15 +826,14 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
state GrvTransactionRateInfo batchRateInfo(0);
state GrvProxyTransactionTagThrottler tagThrottler;
state SpannedDeque<GetReadVersionRequest> systemQueue("GP:transactionStarterSystemQueue"_loc);
state SpannedDeque<GetReadVersionRequest> defaultQueue("GP:transactionStarterDefaultQueue"_loc);
state SpannedDeque<GetReadVersionRequest> batchQueue("GP:transactionStarterBatchQueue"_loc);
state Deque<GetReadVersionRequest> systemQueue;
state Deque<GetReadVersionRequest> defaultQueue;
state Deque<GetReadVersionRequest> batchQueue;
state TransactionTagMap<uint64_t> transactionTagCounter;
state PrioritizedTransactionTagMap<ClientTagThrottleLimits> clientThrottledTags;
state PromiseStream<double> normalGRVLatency;
// state Span span;
state int64_t midShardSize = SERVER_KNOBS->MIN_SHARD_BYTES;
getCurrentLineage()->modify(&TransactionLineage::operation) =
@ -911,7 +910,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
uint32_t defaultQueueSize = defaultQueue.size();
uint32_t batchQueueSize = batchQueue.size();
while (requestsToStart < SERVER_KNOBS->START_TRANSACTION_MAX_REQUESTS_TO_START) {
SpannedDeque<GetReadVersionRequest>* transactionQueue;
Deque<GetReadVersionRequest>* transactionQueue;
if (!systemQueue.empty()) {
transactionQueue = &systemQueue;
} else if (!defaultQueue.empty()) {
@ -921,7 +920,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
} else {
break;
}
// transactionQueue->span.swap(span);
auto& req = transactionQueue->front();
int tc = req.transactionCount;
@ -1017,7 +1015,13 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
int batchGRVProcessed = 0;
for (int i = 0; i < start.size(); i++) {
if (start[i].size()) {
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(SpanContext(),
std::vector<SpanContext> spanContexts;
spanContexts.reserve(start[i].size());
for (const GetReadVersionRequest& request : start[i]) {
spanContexts.push_back(request.spanContext);
}
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(spanContexts,
grvProxyData,
i,
debugID,
@ -1041,7 +1045,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
batchGRVProcessed += batchPriTransactionsStarted[i];
}
}
// span = Span(span.location);
grvProxyData->stats.percentageOfDefaultGRVQueueProcessed =
defaultQueueSize ? (double)defaultGRVProcessed / defaultQueueSize : 1;

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbclient/Knobs.h"
#include "fdbserver/GrvProxyTransactionTagThrottler.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // must be last include
@ -28,6 +29,10 @@ void GrvProxyTransactionTagThrottler::DelayedRequest::updateProxyTagThrottledDur
req.proxyTagThrottledDuration = now() - startTime;
}
bool GrvProxyTransactionTagThrottler::DelayedRequest::isMaxThrottled() const {
return now() - startTime > CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION;
}
void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
if (rateInfo.present()) {
rateInfo.get().setRate(rate);
@ -36,6 +41,20 @@ void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
}
}
bool GrvProxyTransactionTagThrottler::TagQueue::isMaxThrottled() const {
return !requests.empty() && requests.front().isMaxThrottled();
}
void GrvProxyTransactionTagThrottler::TagQueue::rejectRequests() {
CODE_PROBE(true, "GrvProxyTransactionTagThrottler rejecting requests");
while (!requests.empty()) {
auto& delayedReq = requests.front();
delayedReq.updateProxyTagThrottledDuration();
delayedReq.req.reply.sendError(proxy_tag_throttled());
requests.pop_front();
}
}
void GrvProxyTransactionTagThrottler::updateRates(TransactionTagMap<double> const& newRates) {
for (const auto& [tag, rate] : newRates) {
auto it = queues.find(tag);
@ -73,6 +92,7 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
// SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be
// unexpected behaviour, because only one tag is used for throttling.
TraceEvent(SevWarnAlways, "GrvProxyTransactionTagThrottler_MultipleTags")
.suppressFor(1.0)
.detail("NumTags", req.tags.size())
.detail("UsingTag", printable(tag));
}
@ -80,8 +100,8 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
}
void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
SpannedDeque<GetReadVersionRequest>& outBatchPriority,
SpannedDeque<GetReadVersionRequest>& outDefaultPriority) {
Deque<GetReadVersionRequest>& outBatchPriority,
Deque<GetReadVersionRequest>& outDefaultPriority) {
// Pointer to a TagQueue with some extra metadata stored alongside
struct TagQueueHandle {
// Store pointers here to avoid frequent std::unordered_map lookups
@ -140,6 +160,11 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
// Cannot release any more transaction from this tag (don't push the tag queue handle back into
// pqOfQueues)
CODE_PROBE(true, "GrvProxyTransactionTagThrottler throttling transaction");
if (tagQueueHandle.queue->isMaxThrottled()) {
// Requests in this queue have been throttled too long and errors
// should be sent to clients.
tagQueueHandle.queue->rejectRequests();
}
break;
} else {
if (tagQueueHandle.nextSeqNo < nextQueueSeqNo) {
@ -255,8 +280,8 @@ ACTOR static Future<Void> mockFifoClient(GrvProxyTransactionTagThrottler* thrott
}
ACTOR static Future<Void> mockServer(GrvProxyTransactionTagThrottler* throttler) {
state SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
state SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
state Deque<GetReadVersionRequest> outBatchPriority;
state Deque<GetReadVersionRequest> outDefaultPriority;
loop {
state double elapsed = (0.009 + 0.002 * deterministicRandom()->random01());
wait(delay(elapsed));
@ -379,8 +404,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup2") {
throttler.updateRates(TransactionTagMap<double>{});
ASSERT_EQ(throttler.size(), 1);
{
SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
Deque<GetReadVersionRequest> outBatchPriority;
Deque<GetReadVersionRequest> outDefaultPriority;
throttler.releaseTransactions(0.1, outBatchPriority, outDefaultPriority);
}
// Calling updates cleans up the queues in throttler

View File

@ -347,8 +347,8 @@ public:
Randomize::False,
g_network->isSimulated() ? IsSimulated::True : IsSimulated::False);
}
logger = traceCounters(
"LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "LocalConfigurationMetrics");
logger = cc.traceCounters(
"LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "LocalConfigurationMetrics");
}
Future<Void> addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> changes,

View File

@ -190,15 +190,14 @@ struct LogRouterData {
});
specialCounter(cc, "Generation", [this]() { return this->generation; });
specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; });
logger = traceCounters("LogRouterMetrics",
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
&cc,
"LogRouterMetrics",
[this](TraceEvent& te) {
te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
te.detail("RouterTag", this->routerTag.toString());
});
logger = cc.traceCounters("LogRouterMetrics",
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
"LogRouterMetrics",
[this](TraceEvent& te) {
te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
te.detail("RouterTag", this->routerTag.toString());
});
}
};

View File

@ -0,0 +1,623 @@
/*
* MockGlobalState.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/MockGlobalState.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/DataDistribution.actor.h"
#include "flow/actorcompiler.h"
// Implementation holder for MockGlobalState: the Flow ACTOR bodies backing its
// public Future-returning methods live here.
class MockGlobalStateImpl {
public:
	// Polls shard locations for `keys` and loops until
	// waitStorageMetricsWithLocation reports a value, retrying after
	// WRONG_SHARD_SERVER_DELAY otherwise.
	ACTOR static Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(MockGlobalState* mgs,
	                                                                                KeyRange keys,
	                                                                                StorageMetrics min,
	                                                                                StorageMetrics max,
	                                                                                StorageMetrics permittedError,
	                                                                                int shardLimit,
	                                                                                int expectedShardCount) {
		state TenantInfo tenantInfo;
		loop {
			auto locations = mgs->getKeyRangeLocations(tenantInfo,
			                                           keys,
			                                           shardLimit,
			                                           Reverse::False,
			                                           SpanContext(),
			                                           Optional<UID>(),
			                                           UseProvisionalProxies::False,
			                                           0)
			                     .get();
			TraceEvent(SevDebug, "MGSWaitStorageMetrics").detail("Phase", "GetLocation");
			// NOTE(xwang): in native API, there's code handling the non-equal situation, but I think in mock world
			// there shouldn't have any delay to update the locations.
			ASSERT_EQ(expectedShardCount, locations.size());
			Optional<StorageMetrics> res =
			    wait(::waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
			if (res.present()) {
				// -1 mirrors the native API's "no shard-limit violation" signal — TODO confirm.
				return std::make_pair(res, -1);
			}
			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
		}
	}

	// SOMEDAY: reuse the NativeAPI implementation
	// Computes split points for `keys`, backing off while too many shards are
	// returned (same strategy as NativeAPI::splitStorageMetrics).
	ACTOR static Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(MockGlobalState* mgs,
	                                                                       KeyRange keys,
	                                                                       StorageMetrics limit,
	                                                                       StorageMetrics estimated,
	                                                                       Optional<int> minSplitBytes) {
		state TenantInfo tenantInfo;
		loop {
			state std::vector<KeyRangeLocationInfo> locations =
			    mgs->getKeyRangeLocations(tenantInfo,
			                              keys,
			                              CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT,
			                              Reverse::False,
			                              SpanContext(),
			                              Optional<UID>(),
			                              UseProvisionalProxies::False,
			                              0)
			        .get();
			// Same solution to NativeAPI::splitStorageMetrics, wait some merge finished
			if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
				wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
			}
			Optional<Standalone<VectorRef<KeyRef>>> results =
			    wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));
			if (results.present()) {
				return results.get();
			}
			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
		}
	}
};
// Implementation holder for MockStorageServer's Flow ACTOR bodies.
class MockStorageServerImpl {
public:
	// Serves a WaitMetricsRequest. Replies with wrong_shard_server (plus load
	// penalty) if this server is not responsible for the requested keys;
	// otherwise forwards to the metrics subsystem with a jittered timeout.
	ACTOR static Future<Void> waitMetricsTenantAware(MockStorageServer* self, WaitMetricsRequest req) {
		if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) {
			// TODO(xwang) add support for tenant test, search for tenant entry
			Optional<TenantMapEntry> entry;
			Optional<Key> tenantPrefix = entry.map<Key>([](TenantMapEntry e) { return e.prefix; });
			if (tenantPrefix.present()) {
				// Unreachable until tenant lookup above is implemented (entry is always empty).
				UNREACHABLE();
				// req.keys = req.keys.withPrefix(tenantPrefix.get(), req.arena);
			}
		}
		if (!self->isReadable(req.keys)) {
			self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
		} else {
			wait(self->metrics.waitMetrics(req, delayJittered(SERVER_KNOBS->STORAGE_METRIC_TIMEOUT)));
		}
		return Void();
	}
};
bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
	// Every shard overlapping `range` must carry `status` for this to hold.
	auto overlapping = serverKeys.intersectingRanges(range);
	ASSERT(!overlapping.empty()); // at least the range is allKeys
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		if (shard->cvalue().status != status) {
			return false;
		}
	}
	return true;
}
// Sets the status of all shards overlapping `range` to `status`, implicitly
// splitting shards whose boundaries are not aligned with `range`. When
// `restrictSize` is set, the implicit splits are constrained so the pieces sum
// to the original shard size. Invalid status transitions are logged at SevError.
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
	auto ranges = serverKeys.intersectingRanges(range);
	ASSERT(!ranges.empty());
	// `range` sits strictly inside a single shard: split that shard into three.
	if (ranges.begin().range().contains(range)) {
		CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
		threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
		return;
	}
	if (ranges.begin().begin() < range.begin) {
		CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
		twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
	}
	// NOTE(review): `ranges.end()` is the past-the-end iterator of the
	// intersecting set; confirm this (rather than the last intersecting range)
	// is the intended range to test and split here.
	if (ranges.end().end() > range.end) {
		CODE_PROBE(true, "Implicitly split end range to 2 pieces");
		twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
	}
	ranges = serverKeys.containedRanges(range);
	// now the boundary must be aligned
	ASSERT(ranges.begin().begin() == range.begin);
	ASSERT(ranges.end().end() == range.end);
	// All covered pieces are assigned the combined size of the range.
	uint64_t newSize = 0;
	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
		newSize += it->cvalue().shardSize;
	}
	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
		auto oldStatus = it.value().status;
		if (isStatusTransitionValid(oldStatus, status)) {
			it.value() = ShardInfo{ status, newSize };
		} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
			CODE_PROBE(true, "Shard already on server");
		} else {
			TraceEvent(SevError, "MockShardStatusTransitionError")
			    .detail("From", oldStatus)
			    .detail("To", status)
			    .detail("ID", id)
			    .detail("KeyBegin", range.begin.toHexString())
			    .detail("KeyEnd", range.end.toHexString()); // bug fix: previously logged range.begin twice
		}
	}
	serverKeys.coalesce(range);
}
// split the out range [a, d) based on the inner range's boundary [b, c). The result would be [a,b), [b,c), [c,d). The
// size of the new shards are randomly split from old size of [a, d)
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
                                               KeyRangeRef innerRange,
                                               uint64_t outerRangeSize,
                                               bool restrictSize) {
	ASSERT(outerRange.contains(innerRange));
	Key left = outerRange.begin;
	// Randomly generate the 3 piece sizes; the caller guarantees the min/max parameters are always valid.
	// With restrictSize, the upper bounds are chosen so the three pieces sum exactly to outerRangeSize.
	int leftSize = deterministicRandom()->randomInt(
	    SERVER_KNOBS->MIN_SHARD_BYTES,
	    restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
	int midSize = deterministicRandom()->randomInt(
	    SERVER_KNOBS->MIN_SHARD_BYTES,
	    restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
	int rightSize =
	    restrictSize ? outerRangeSize - leftSize - midSize
	                 : deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);

	// The middle piece inherits the status of the original shard; the outer
	// pieces are resized in place.
	serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
	serverKeys[left].shardSize = leftSize;
	serverKeys[innerRange.end].shardSize = rightSize;
}
// split the range [a,c) with split point b. The result would be [a, b), [b, c). The
// size of the new shards are randomly split from old size of [a, c)
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
                                             KeyRef splitPoint,
                                             uint64_t rangeSize,
                                             bool restrictSize) {
	Key left = range.begin;
	// Randomly generate the two piece sizes; the caller guarantees the min/max parameters are always valid.
	// With restrictSize, the pieces are forced to sum exactly to rangeSize.
	int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
	                                                restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
	                                                             : SERVER_KNOBS->MAX_SHARD_BYTES);
	int rightSize =
	    restrictSize ? rangeSize - leftSize
	                 : deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
	// The right piece inherits the status of the original shard; the left piece
	// is resized in place.
	serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
	serverKeys[left].shardSize = leftSize;
}
void MockStorageServer::removeShard(KeyRangeRef range) {
	// The removed range must exactly match a single existing shard boundary.
	auto contained = serverKeys.containedRanges(range);
	ASSERT(contained.begin().range() == range);
	serverKeys.rawErase(range);
}
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
	// Accumulate the recorded size of every shard overlapping `range`.
	uint64_t total = 0;
	auto overlapping = serverKeys.intersectingRanges(range);
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		total += shard->cvalue().shardSize;
	}
	return total;
}
// Register a background actor whose lifetime is tied to this mock server.
void MockStorageServer::addActor(Future<Void> future) {
	actors.add(future);
}
// No-op stub: split-point requests are not modeled by the mock server.
void MockStorageServer::getSplitPoints(const SplitRangeRequest& req) {}
// Thin wrapper delegating to the ACTOR implementation in MockStorageServerImpl.
Future<Void> MockStorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req) {
	return MockStorageServerImpl::waitMetricsTenantAware(this, req);
}
// No-op stub: storage-metrics requests are not modeled by the mock server.
void MockStorageServer::getStorageMetrics(const GetStorageMetricsRequest& req) {}
// Starts the mock server: assigns a random locality, brings up its endpoints,
// and returns the long-running actor that serves storage-metrics requests.
Future<Void> MockStorageServer::run() {
	// Random zone/machine ids make each mock server look like a distinct process.
	ssi.locality = LocalityData(Optional<Standalone<StringRef>>(),
	                            Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
	                            Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
	                            Optional<Standalone<StringRef>>());
	ssi.initEndpoints();
	ssi.startAcceptingRequests();
	TraceEvent("MockStorageServerStart").detail("Address", ssi.address());
	return serveStorageMetricsRequests(this, ssi);
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
std::vector<UID> serverIds;
for (int i = 1; i <= conf.storageTeamSize; ++i) {
UID id = indexToUID(i);
serverIds.push_back(id);
allServers[id] = MockStorageServer(id, defaultDiskSpace);
allServers[id].serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 });
}
shardMapping->assignRangeToTeams(allKeys, { Team(serverIds, true) });
}
// Register an additional mock storage server backed by `server`'s interface.
void MockGlobalState::addStorageServer(StorageServerInterface server, uint64_t diskSpace) {
	allServers[server.id()] = MockStorageServer(server, diskSpace);
}
bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef shard, bool inFlightShard) {
if (!allServers.count(serverId))
return false;
// check serverKeys
auto& mss = allServers.at(serverId);
if (!mss.allShardStatusEqual(shard, MockShardStatus::COMPLETED)) {
return false;
}
// check keyServers
auto teams = shardMapping->getTeamsForFirstShard(shard);
if (inFlightShard) {
return std::any_of(teams.second.begin(), teams.second.end(), [&serverId](const Team& team) {
return team.hasServer(serverId);
});
}
return std::any_of(
teams.first.begin(), teams.first.end(), [&serverId](const Team& team) { return team.hasServer(serverId); });
}
bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) {
if (!allServers.count(serverId))
return false;
// check serverKeys
auto& mss = allServers.at(serverId);
if (!mss.allShardStatusEqual(shard, MockShardStatus::INFLIGHT)) {
return false;
}
// check keyServers
auto teams = shardMapping->getTeamsForFirstShard(shard);
return !teams.second.empty() && std::any_of(teams.first.begin(), teams.first.end(), [&serverId](const Team& team) {
return team.hasServer(serverId);
});
}
// True iff `serverId` is a known server and no shards are mapped to it anymore.
bool MockGlobalState::allShardsRemovedFromServer(const UID& serverId) {
	return allServers.count(serverId) && shardMapping->getNumberOfShards(serverId) == 0;
}
// Thin wrapper forwarding to the ACTOR implementation in MockGlobalStateImpl.
Future<std::pair<Optional<StorageMetrics>, int>> MockGlobalState::waitStorageMetrics(
    const KeyRange& keys,
    const StorageMetrics& min,
    const StorageMetrics& max,
    const StorageMetrics& permittedError,
    int shardLimit,
    int expectedShardCount) {
	return MockGlobalStateImpl::waitStorageMetrics(
	    this, keys, min, max, permittedError, shardLimit, expectedShardCount);
}
// Wrap each storage server interface in a ReferencedInterface and bundle them
// into a single LocationInfo.
Reference<LocationInfo> buildLocationInfo(const std::vector<StorageServerInterface>& interfaces) {
	std::vector<Reference<ReferencedInterface<StorageServerInterface>>> refs;
	refs.reserve(interfaces.size());
	for (const StorageServerInterface& ssi : interfaces) {
		refs.push_back(makeReference<ReferencedInterface<StorageServerInterface>>(ssi));
	}
	return makeReference<LocationInfo>(refs);
}
// Resolves the location of the single shard containing `key`. The mock
// resolves locations synchronously, so the returned Future is already ready.
Future<KeyRangeLocationInfo> MockGlobalState::getKeyLocation(TenantInfo tenant,
                                                             Key key,
                                                             SpanContext spanContext,
                                                             Optional<UID> debugID,
                                                             UseProvisionalProxies useProvisionalProxies,
                                                             Reverse isBackward,
                                                             Version version) {
	if (isBackward) {
		// DD never ask for backward range.
		UNREACHABLE();
	}
	ASSERT(key < allKeys.end);
	GetKeyServerLocationsReply rep;
	KeyRange single = singleKeyRange(key);
	auto teamPair = shardMapping->getTeamsForFirstShard(single);
	// Prefer the second team list when non-empty, otherwise fall back to the first.
	auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second;
	ASSERT_EQ(srcTeam.size(), 1);
	rep.results.emplace_back(single, extractStorageServerInterfaces(srcTeam.front().servers));
	return KeyRangeLocationInfo(
	    rep.tenantEntry,
	    KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
	    buildLocationInfo(rep.results[0].second));
}
// Resolves locations for up to `limit` shards intersecting `keys`. The mock
// resolves locations synchronously, so the returned Future is already ready.
Future<std::vector<KeyRangeLocationInfo>> MockGlobalState::getKeyRangeLocations(
    TenantInfo tenant,
    KeyRange keys,
    int limit,
    Reverse reverse,
    SpanContext spanContext,
    Optional<UID> debugID,
    UseProvisionalProxies useProvisionalProxies,
    Version version) {
	if (reverse) {
		// DD never ask for backward range.
		ASSERT(false);
	}
	ASSERT(keys.begin < keys.end);
	GetKeyServerLocationsReply rep;
	auto ranges = shardMapping->intersectingRanges(keys);
	auto it = ranges.begin();
	for (int count = 0; it != ranges.end() && count < limit; ++it, ++count) {
		// Prefer the second team list when non-empty, otherwise fall back to the first.
		auto teamPair = shardMapping->getTeamsFor(it->begin());
		auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second;
		ASSERT_EQ(srcTeam.size(), 1);
		rep.results.emplace_back(it->range(), extractStorageServerInterfaces(srcTeam.front().servers));
	}
	CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited", probe::decoration::rare);
	// Clip every returned range to the queried span.
	std::vector<KeyRangeLocationInfo> results;
	for (int shard = 0; shard < rep.results.size(); shard++) {
		results.emplace_back(rep.tenantEntry,
		                     (toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
		                     buildLocationInfo(rep.results[shard].second));
	}
	return results;
}
std::vector<StorageServerInterface> MockGlobalState::extractStorageServerInterfaces(const std::vector<UID>& ids) const {
std::vector<StorageServerInterface> interfaces;
for (auto& id : ids) {
interfaces.emplace_back(allServers.at(id).ssi);
}
return interfaces;
}
// Thin wrapper forwarding to the ACTOR implementation in MockGlobalStateImpl.
Future<Standalone<VectorRef<KeyRef>>> MockGlobalState::splitStorageMetrics(const KeyRange& keys,
                                                                           const StorageMetrics& limit,
                                                                           const StorageMetrics& estimated,
                                                                           const Optional<int>& minSplitBytes) {
	return MockGlobalStateImpl::splitStorageMetrics(this, keys, limit, estimated, minSplitBytes);
}
// Verifies that initializing an empty database (replication factor 3) makes
// every configured server a source for the whole keyspace with zero bytes stored.
TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
	BasicTestConfig testConfig;
	testConfig.simpleConfig = true;
	testConfig.minimumReplication = 3;
	testConfig.logAntiQuorum = 0;
	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
	auto mgs = std::make_shared<MockGlobalState>();
	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
	for (int i = 1; i <= dbConfig.storageTeamSize; ++i) {
		auto id = MockGlobalState::indexToUID(i);
		std::cout << "Check server " << i << "\n";
		ASSERT(mgs->serverIsSourceForShard(id, allKeys));
		ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
	}
	return Void();
}
// Test helper exercising MockStorageServer splitting and MockGlobalState
// location queries from the unit tests below.
struct MockGlobalStateTester {

	// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
	void testThreeWaySplitFirstRange(MockStorageServer& mss) {
		auto it = mss.serverKeys.ranges().begin();
		uint64_t oldSize =
		    deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
		MockShardStatus oldStatus = it.cvalue().status;
		it->value().shardSize = oldSize;
		KeyRangeRef outerRange = it->range();
		Key x1 = keyAfter(it->range().begin);
		Key x2 = keyAfter(x1);
		std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
		mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
		// Verify the three resulting pieces; the middle one keeps the old status.
		auto ranges = mss.serverKeys.containedRanges(outerRange);
		ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
		ranges.pop_front();
		ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
		ASSERT(ranges.begin().cvalue().status == oldStatus);
		ranges.pop_front();
		ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
		ranges.pop_front();
		ASSERT(ranges.empty());
	}

	// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
	void testTwoWaySplitFirstRange(MockStorageServer& mss) {
		auto it = mss.serverKeys.nthRange(0);
		MockShardStatus oldStatus = it.cvalue().status;
		uint64_t oldSize =
		    deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
		it->value().shardSize = oldSize;
		KeyRangeRef outerRange = it->range();
		Key x1 = keyAfter(it->range().begin);
		std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
		mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
		// Verify the two resulting pieces; the right one keeps the old status.
		auto ranges = mss.serverKeys.containedRanges(outerRange);
		ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
		ranges.pop_front();
		ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
		ASSERT(ranges.begin().cvalue().status == oldStatus);
		ranges.pop_front();
		ASSERT(ranges.empty());
	}

	// Synchronously resolve the location of a single key.
	KeyRangeLocationInfo getKeyLocationInfo(KeyRef key, std::shared_ptr<MockGlobalState> mgs) {
		return mgs
		    ->getKeyLocation(
		        TenantInfo(), key, SpanContext(), Optional<UID>(), UseProvisionalProxies::False, Reverse::False, 0)
		    .get();
	}

	// Synchronously resolve the locations of up to `limit` shards in `keys`.
	std::vector<KeyRangeLocationInfo> getKeyRangeLocations(KeyRangeRef keys,
	                                                       int limit,
	                                                       std::shared_ptr<MockGlobalState> mgs) {
		return mgs
		    ->getKeyRangeLocations(TenantInfo(),
		                           keys,
		                           limit,
		                           Reverse::False,
		                           SpanContext(),
		                           Optional<UID>(),
		                           UseProvisionalProxies::False,
		                           0)
		    .get();
	}
};
// Exercises the two-way and three-way shard-splitting helpers on a mock server.
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
	BasicTestConfig testConfig;
	testConfig.simpleConfig = true;
	testConfig.minimumReplication = 1;
	testConfig.logAntiQuorum = 0;
	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
	auto mgs = std::make_shared<MockGlobalState>();
	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
	MockGlobalStateTester tester;
	auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
	std::cout << "Test 3-way splitting...\n";
	tester.testThreeWaySplitFirstRange(mss);
	std::cout << "Test 2-way splitting...\n";
	mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
	tester.testTwoWaySplitFirstRange(mss);
	return Void();
}
namespace {
// Returns true iff `loc` holds exactly the interfaces of the servers in `ids`
// (same cardinality and every id present).
inline bool locationInfoEqualsToTeam(Reference<LocationInfo> loc, const std::vector<UID>& ids) {
	if (loc->locations()->size() != ids.size()) {
		return false;
	}
	for (const auto& id : ids) {
		if (!loc->locations()->hasInterface(id)) {
			return false;
		}
	}
	return true;
}
} // namespace
// Unit test: verify MockGlobalState key/range location lookups return the teams assigned
// via the shard mapping, including the `limit` truncation behavior of range lookups.
TEST_CASE("/MockGlobalState/MockStorageServer/GetKeyLocations") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
// add one empty server
mgs->addStorageServer(StorageServerInterface(mgs->indexToUID(mgs->allServers.size() + 1)));
// define 3 ranges:
// team 1 (UID 1,2,...,n-1):[begin, 1.0), [2.0, end)
// team 2 (UID 2,3,...n-1, n): [1.0, 2.0)
ShardsAffectedByTeamFailure::Team team1, team2;
for (int i = 0; i < mgs->allServers.size() - 1; ++i) {
UID id = mgs->indexToUID(i + 1);
team1.servers.emplace_back(id);
id = mgs->indexToUID(i + 2);
team2.servers.emplace_back(id);
}
Key one = doubleToTestKey(1.0), two = doubleToTestKey(2.0);
std::vector<KeyRangeRef> ranges{ KeyRangeRef(allKeys.begin, one),
KeyRangeRef(one, two),
KeyRangeRef(two, allKeys.end) };
mgs->shardMapping->assignRangeToTeams(ranges[0], { team1 });
mgs->shardMapping->assignRangeToTeams(ranges[1], { team2 });
mgs->shardMapping->assignRangeToTeams(ranges[2], { team1 });
// query key location
MockGlobalStateTester tester;
// -- team 1
Key testKey = doubleToTestKey(0.5);
auto locInfo = tester.getKeyLocationInfo(testKey, mgs);
ASSERT(locationInfoEqualsToTeam(locInfo.locations, team1.servers));
// -- team 2
testKey = doubleToTestKey(1.3);
locInfo = tester.getKeyLocationInfo(testKey, mgs);
ASSERT(locationInfoEqualsToTeam(locInfo.locations, team2.servers));
// query range location
testKey = doubleToTestKey(3.0);
// team 1,2,1
// With a large limit all three shards are returned; the final shard is clipped to testKey.
auto locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 100, mgs);
ASSERT(locInfos.size() == 3);
ASSERT(locInfos[0].range == ranges[0]);
ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers));
ASSERT(locInfos[1].range == ranges[1]);
ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers));
ASSERT(locInfos[2].range == KeyRangeRef(ranges[2].begin, testKey));
ASSERT(locationInfoEqualsToTeam(locInfos[2].locations, team1.servers));
// team 1,2
// With limit = 2 only the first two shards are returned.
locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 2, mgs);
ASSERT(locInfos.size() == 2);
ASSERT(locInfos[0].range == ranges[0]);
ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers));
ASSERT(locInfos[1].range == ranges[1]);
ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers));
return Void();
}
// Unit test: run the mock storage servers as actors and verify waitStorageMetrics reports
// the byte-sample data seeded into each server.
TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("WaitStorageMetricsRequestUnitTestConfig").detail("Config", dbConfig.toString());
state std::shared_ptr<MockGlobalState> mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
state ActorCollection actors;
ActorCollection* ptr = &actors; // get around ACTOR syntax restriction
// Start each mock server's actor, mark it healthy, and seed 500000 sampled bytes.
std::for_each(mgs->allServers.begin(), mgs->allServers.end(), [ptr](auto& server) {
ptr->add(server.second.run());
IFailureMonitor::failureMonitor().setStatus(server.second.ssi.address(), FailureStatus(false));
server.second.metrics.byteSample.sample.insert("something"_sr, 500000);
});
KeyRange testRange = allKeys;
ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
std::pair<Optional<StorageMetrics>, int> res =
wait(mgs->waitStorageMetrics(testRange, bounds.min, bounds.max, bounds.permittedError, 1, 1));
// std::cout << "get result " << res.second << "\n";
// std::cout << "get byte "<< res.first.get().bytes << "\n";
ASSERT_EQ(res.second, -1); // the valid result always return -1, strange contraction though.
ASSERT_EQ(res.first.get().bytes, 500000);
return Void();
}

View File

@ -1,281 +0,0 @@
/*
* MockGlobalState.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/MockGlobalState.h"
// Returns true iff every shard on this server that intersects `range` has status `status`.
bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
	auto overlapping = serverKeys.intersectingRanges(range);
	ASSERT(!overlapping.empty()); // at least the range is allKeys
	bool allEqual = true;
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		if (shard->cvalue().status != status) {
			allEqual = false;
			break;
		}
	}
	return allEqual;
}
// Set `status` on the portion of this server's shard map covered exactly by `range`.
// Shards straddling a boundary of `range` are first split implicitly (3-way when one shard
// fully contains `range`, otherwise 2-way at each misaligned edge) so the assignment is
// exact. `restrictSize` is forwarded to the splitting helpers to bound the randomly chosen
// sub-shard sizes. Invalid status transitions are reported via a SevError trace event.
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
	auto ranges = serverKeys.intersectingRanges(range);
	ASSERT(!ranges.empty());
	if (ranges.begin().range().contains(range)) {
		CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
		threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
		return;
	}
	if (ranges.begin().begin() < range.begin) {
		CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
		twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
	}
	if (ranges.end().end() > range.end) {
		CODE_PROBE(true, "Implicitly split end range to 2 pieces");
		twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
	}
	ranges = serverKeys.containedRanges(range);
	// now the boundary must be aligned
	ASSERT(ranges.begin().begin() == range.begin);
	ASSERT(ranges.end().end() == range.end);
	// All contained pieces are merged back into a single logical size before the status change.
	uint64_t newSize = 0;
	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
		newSize += it->cvalue().shardSize;
	}
	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
		auto oldStatus = it.value().status;
		if (isStatusTransitionValid(oldStatus, status)) {
			it.value() = ShardInfo{ status, newSize };
		} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
			CODE_PROBE(true, "Shard already on server");
		} else {
			TraceEvent(SevError, "MockShardStatusTransitionError")
			    .detail("From", oldStatus)
			    .detail("To", status)
			    .detail("ID", id)
			    .detail("KeyBegin", range.begin.toHexString())
			    .detail("KeyEnd", range.end.toHexString()); // bug fix: previously logged range.begin twice
		}
	}
	serverKeys.coalesce(range);
}
// split the out range [a, d) based on the inner range's boundary [b, c). The result would be [a,b), [b,c), [c,d). The
// size of the new shards are randomly split from old size of [a, d)
// When restrictSize is true the three sizes sum exactly to outerRangeSize; otherwise each is
// an independent draw in [MIN_SHARD_BYTES, MAX_SHARD_BYTES).
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
uint64_t outerRangeSize,
bool restrictSize) {
ASSERT(outerRange.contains(innerRange));
Key left = outerRange.begin;
// random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid.
// Left piece: when restricted, leave at least MIN_SHARD_BYTES for each of the other two pieces.
int leftSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
// Middle piece: when restricted, leave at least MIN_SHARD_BYTES for the right piece.
int midSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
// Right piece: the remainder when restricted, otherwise an independent random size.
int rightSize =
restrictSize ? outerRangeSize - leftSize - midSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
// Inserting innerRange splits the outer shard; the new middle inherits the left shard's status.
serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
serverKeys[left].shardSize = leftSize;
serverKeys[innerRange.end].shardSize = rightSize;
}
// split the range [a,c) with split point b. The result would be [a, b), [b, c). The
// size of the new shards are randomly split from old size of [a, c)
// When restrictSize is true the two sizes sum exactly to rangeSize.
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
KeyRef splitPoint,
uint64_t rangeSize,
bool restrictSize) {
Key left = range.begin;
// randomly generate 2 shard sizes, the caller guarantee that the min, max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
: SERVER_KNOBS->MAX_SHARD_BYTES);
// Right piece: the remainder when restricted, otherwise an independent random size.
int rightSize =
restrictSize ? rangeSize - leftSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
// rawInsert at splitPoint cuts the shard in two; the right half inherits the left's status.
serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
serverKeys[left].shardSize = leftSize;
}
// Erase `range` from this server's shard map; `range` must exactly match an existing
// shard boundary (asserted), so no implicit splitting happens here.
void MockStorageServer::removeShard(KeyRangeRef range) {
	auto contained = serverKeys.containedRanges(range);
	ASSERT(contained.begin().range() == range);
	serverKeys.rawErase(range);
}
// Total byte size of all shards on this server that intersect `range`.
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
	uint64_t total = 0;
	auto overlapping = serverKeys.intersectingRanges(range);
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		total += shard->cvalue().shardSize;
	}
	return total;
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
std::vector<UID> serverIds;
for (int i = 1; i <= conf.storageTeamSize; ++i) {
UID id = indexToUID(i);
serverIds.push_back(id);
allServers[id] = MockStorageServer(id, defaultDiskSpace);
allServers[id].serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 });
}
shardMapping->assignRangeToTeams(allKeys, { Team(serverIds, true) });
}
// Register an additional mock storage server, keyed by its interface id.
void MockGlobalState::addStorageServer(StorageServerInterface server, uint64_t diskSpace) {
	const UID serverId = server.id();
	allServers[serverId] = MockStorageServer(server, diskSpace);
}
// Returns true iff `serverId` serves `shard` as a source: the server's local shard map must
// mark the whole range COMPLETED, and the shard mapping must list the server in one of the
// destination teams when `inFlightShard` is set, or one of the source teams otherwise.
bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef shard, bool inFlightShard) {
	auto serverIt = allServers.find(serverId);
	if (serverIt == allServers.end())
		return false;

	// check serverKeys
	if (!serverIt->second.allShardStatusEqual(shard, MockShardStatus::COMPLETED))
		return false;

	// check keyServers
	auto [srcTeams, destTeams] = shardMapping->getTeamsFor(shard);
	const auto& teams = inFlightShard ? destTeams : srcTeams;
	for (const auto& team : teams) {
		if (team.hasServer(serverId))
			return true;
	}
	return false;
}
// Returns true iff `serverId` is a destination for `shard`: the server's local shard map
// must mark the whole range INFLIGHT and a data move must be in progress (non-empty dest
// teams). NOTE(review): like the original, this then checks membership against the SOURCE
// teams (teams.first) — confirm that is the intended semantics.
bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) {
	auto serverIt = allServers.find(serverId);
	if (serverIt == allServers.end())
		return false;

	// check serverKeys
	if (!serverIt->second.allShardStatusEqual(shard, MockShardStatus::INFLIGHT))
		return false;

	// check keyServers
	auto [srcTeams, destTeams] = shardMapping->getTeamsFor(shard);
	if (destTeams.empty())
		return false;
	for (const auto& team : srcTeams) {
		if (team.hasServer(serverId))
			return true;
	}
	return false;
}
// Returns true iff `serverId` is a known server and the shard mapping assigns it no shards.
bool MockGlobalState::allShardRemovedFromServer(const UID& serverId) {
	if (!allServers.count(serverId))
		return false;
	return shardMapping->getNumberOfShards(serverId) == 0;
}
// Unit test: with triple replication, an empty MockGlobalState should have every one of the
// three servers as a source for allKeys, each holding zero bytes.
TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 3;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
// Servers are created with UID indices 1..storageTeamSize.
for (int i = 1; i <= dbConfig.storageTeamSize; ++i) {
auto id = MockGlobalState::indexToUID(i);
std::cout << "Check server " << i << "\n";
ASSERT(mgs->serverIsSourceForShard(id, allKeys));
ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
}
return Void();
}
// Helper used by the splitting unit tests: drives the implicit shard-splitting functions on
// a MockStorageServer and verifies the resulting shard-map boundaries and statuses.
struct MockGlobalStateTester {
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
void testThreeWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.ranges().begin();
// Seed the first shard with a random size large enough to split without restriction.
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
MockShardStatus oldStatus = it.cvalue().status;
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
// x1 and x2 are the smallest keys after the range begin, forming the inner range [x1, x2).
Key x1 = keyAfter(it->range().begin);
Key x2 = keyAfter(x1);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
// The outer range must now consist of exactly three shards; the middle keeps the old status.
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
ranges.pop_front();
ASSERT(ranges.empty());
}
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
void testTwoWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.nthRange(0);
MockShardStatus oldStatus = it.cvalue().status;
// Seed the first shard with a random size large enough to split without restriction.
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
// The outer range must now consist of exactly two shards; the right one keeps the old status.
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.empty());
}
};
// Unit test: exercise MockStorageServer's implicit 3-way and 2-way shard splitting on a
// single-server, single-replica mock database.
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
MockGlobalStateTester tester;
// With minimumReplication = 1 there is exactly one storage server, UID index 1.
auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
std::cout << "Test 3-way splitting...\n";
tester.testThreeWaySplitFirstRange(mss);
std::cout << "Test 2-way splitting...\n";
// The 3-way test fragmented the key space; restore a single allKeys shard before the 2-way test.
mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
tester.testTwoWaySplitFirstRange(mss);
return Void();
}

View File

@ -1287,7 +1287,7 @@ ACTOR static Future<Void> startMoveShards(Database occ,
TraceEvent(SevVerbose, "StartMoveShardsFoundDataMove", relocationIntervalId)
.detail("DataMoveID", dataMoveId)
.detail("DataMove", dataMove.toString());
ASSERT(dataMove.range.begin == keys.begin);
ASSERT(!dataMove.ranges.empty() && dataMove.ranges.front().begin == keys.begin);
if (dataMove.getPhase() == DataMoveMetaData::Deleting) {
TraceEvent(SevVerbose, "StartMoveShardsDataMove", relocationIntervalId)
.detail("DataMoveBeingDeleted", dataMoveId);
@ -1296,10 +1296,10 @@ ACTOR static Future<Void> startMoveShards(Database occ,
if (dataMove.getPhase() == DataMoveMetaData::Running) {
TraceEvent(SevVerbose, "StartMoveShardsDataMove", relocationIntervalId)
.detail("DataMoveAlreadyCommitted", dataMoveId);
ASSERT(keys == dataMove.range);
ASSERT(keys == dataMove.ranges.front());
return Void();
}
begin = dataMove.range.end;
begin = dataMove.ranges.front().end;
} else {
dataMove.id = dataMoveId;
TraceEvent(SevVerbose, "StartMoveKeysNewDataMove", relocationIntervalId)
@ -1441,7 +1441,8 @@ ACTOR static Future<Void> startMoveShards(Database occ,
&tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysValue(dataMoveId)));
}
dataMove.range = KeyRangeRef(keys.begin, currentKeys.end);
dataMove.ranges.clear();
dataMove.ranges.push_back(KeyRangeRef(keys.begin, currentKeys.end));
dataMove.dest.insert(servers.begin(), servers.end());
}
@ -1471,7 +1472,7 @@ ACTOR static Future<Void> startMoveShards(Database occ,
.detail("DataMoveKey", dataMoveKeyFor(dataMoveId))
.detail("CommitVersion", tr.getCommittedVersion())
.detail("DeltaRange", currentKeys.toString())
.detail("Range", dataMove.range.toString())
.detail("Range", describe(dataMove.ranges))
.detail("DataMove", dataMove.toString());
dataMove = DataMoveMetaData();
@ -1628,7 +1629,8 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
throw data_move_cancelled();
}
ASSERT(dataMove.getPhase() == DataMoveMetaData::Running);
range = dataMove.range;
ASSERT(!dataMove.ranges.empty());
range = dataMove.ranges.front();
} else {
TraceEvent(SevWarn, "FinishMoveShardsDataMoveDeleted", relocationIntervalId)
.detail("DataMoveID", dataMoveId);
@ -1766,7 +1768,7 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
wait(waitForAll(actors));
if (range.end == dataMove.range.end) {
if (range.end == dataMove.ranges.front().end) {
tr.clear(dataMoveKeyFor(dataMoveId));
complete = true;
TraceEvent(SevVerbose, "FinishMoveShardsDeleteMetaData", dataMoveId)
@ -1776,7 +1778,7 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
.detail("DataMoveID", dataMoveId)
.detail("CurrentRange", range)
.detail("NewDataMoveMetaData", dataMove.toString());
dataMove.range = KeyRangeRef(range.end, dataMove.range.end);
dataMove.ranges.front() = KeyRangeRef(range.end, dataMove.ranges.front().end);
tr.set(dataMoveKeyFor(dataMoveId), dataMoveValue(dataMove));
}
@ -2229,9 +2231,10 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx,
Optional<Value> val = wait(tr.get(dataMoveKeyFor(destId)));
if (val.present()) {
state DataMoveMetaData dataMove = decodeDataMoveValue(val.get());
ASSERT(!dataMove.ranges.empty());
TraceEvent(SevVerbose, "RemoveRangeFoundDataMove", serverID)
.detail("DataMoveMetaData", dataMove.toString());
if (range == dataMove.range) {
if (range == dataMove.ranges.front()) {
tr.clear(dataMoveKeyFor(destId));
} else {
dataMove.setPhase(DataMoveMetaData::Deleting);
@ -2350,10 +2353,11 @@ ACTOR Future<Void> cleanUpDataMove(Database occ,
Optional<Value> val = wait(tr.get(dataMoveKeyFor(dataMoveId)));
if (val.present()) {
dataMove = decodeDataMoveValue(val.get());
ASSERT(!dataMove.ranges.empty());
TraceEvent(SevVerbose, "CleanUpDataMoveMetaData", dataMoveId)
.detail("DataMoveID", dataMoveId)
.detail("DataMoveMetaData", dataMove.toString());
range = dataMove.range;
range = dataMove.ranges.front();
ASSERT(!range.empty());
} else {
TraceEvent(SevDebug, "CleanUpDataMoveNotExist", dataMoveId).detail("DataMoveID", dataMoveId);
@ -2419,14 +2423,14 @@ ACTOR Future<Void> cleanUpDataMove(Database occ,
currentShards[i + 1].value);
}
if (range.end == dataMove.range.end) {
if (range.end == dataMove.ranges.front().end) {
tr.clear(dataMoveKeyFor(dataMoveId));
complete = true;
TraceEvent(SevVerbose, "CleanUpDataMoveDeleteMetaData", dataMoveId)
.detail("DataMoveID", dataMove.toString());
} else {
dataMove.range = KeyRangeRef(range.end, dataMove.range.end);
dataMove.ranges.front() = KeyRangeRef(range.end, dataMove.ranges.front().end);
dataMove.setPhase(DataMoveMetaData::Deleting);
tr.set(dataMoveKeyFor(dataMoveId), dataMoveValue(dataMove));
TraceEvent(SevVerbose, "CleanUpDataMovePartial", dataMoveId)

View File

@ -447,10 +447,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
"Restored");
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "SharedBytesInput", [tLogData]() { return tLogData->bytesInput; });
@ -1399,26 +1399,26 @@ ACTOR Future<Void> tLogCore(TLogData* self, Reference<LogData> logData) {
logData->addActor.send(waitFailureServer(logData->tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, logData->tli, logData, warningCollectorInput));

View File

@ -533,10 +533,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2212,26 +2212,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));

View File

@ -616,10 +616,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2671,26 +2671,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));

View File

@ -35,7 +35,7 @@
#include "fdbserver/ResolverInterface.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
@ -188,7 +188,7 @@ struct Resolver : ReferenceCounted<Resolver> {
specialCounter(cc, "NeededVersion", [this]() { return this->neededVersion.get(); });
specialCounter(cc, "TotalStateBytes", [this]() { return this->totalStateBytes.get(); });
logger = traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ResolverMetrics");
logger = cc.traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ResolverMetrics");
}
~Resolver() { destroyConflictSet(conflictSet); }
};

View File

@ -30,7 +30,7 @@
#include "fdbserver/RestoreLoader.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.

View File

@ -753,6 +753,8 @@ ACTOR Future<CheckpointMetaData> fetchRocksDBCheckpoint(Database cx,
.detail("InitialState", initialState.toString())
.detail("CheckpointDir", dir);
ASSERT(!initialState.ranges.empty());
state std::shared_ptr<CheckpointMetaData> metaData = std::make_shared<CheckpointMetaData>(initialState);
if (metaData->format == RocksDBColumnFamily) {
@ -771,7 +773,7 @@ ACTOR Future<CheckpointMetaData> fetchRocksDBCheckpoint(Database cx,
} else if (metaData->format == RocksDB) {
std::shared_ptr<rocksdb::SstFileWriter> writer =
std::make_shared<rocksdb::SstFileWriter>(rocksdb::EnvOptions(), rocksdb::Options());
wait(fetchCheckpointRange(cx, metaData, metaData->range, dir, writer, cFun));
wait(fetchCheckpointRange(cx, metaData, metaData->ranges.front(), dir, writer, cFun));
}
return *metaData;

View File

@ -40,10 +40,16 @@ int ShardsAffectedByTeamFailure::getNumberOfShards(UID ssID) const {
}
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>, std::vector<ShardsAffectedByTeamFailure::Team>>
ShardsAffectedByTeamFailure::getTeamsFor(KeyRangeRef keys) {
ShardsAffectedByTeamFailure::getTeamsForFirstShard(KeyRangeRef keys) {
return shard_teams[keys.begin];
}
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>, std::vector<ShardsAffectedByTeamFailure::Team>>
ShardsAffectedByTeamFailure::getTeamsFor(KeyRef key) {
return shard_teams[key];
}
void ShardsAffectedByTeamFailure::erase(Team team, KeyRange const& range) {
DisabledTraceEvent(SevDebug, "ShardsAffectedByTeamFailureErase")
.detail("Range", range)
@ -236,3 +242,7 @@ void ShardsAffectedByTeamFailure::removeFailedServerForRange(KeyRangeRef keys, c
}
check();
}
auto ShardsAffectedByTeamFailure::intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges {
return shard_teams.intersectingRanges(keyRange);
}

View File

@ -192,61 +192,6 @@ ACTOR Future<Void> ekLookupByDomainIds(Reference<SimKmsConnectorContext> ctx,
success ? req.reply.send(rep) : req.reply.sendError(encrypt_key_not_found());
return Void();
}
// TODO: switch this to use bg_url instead of hardcoding file://fdbblob, so it works as FDBPerfKmsConnector
// FIXME: make this (more) deterministic outside of simulation for FDBPerfKmsConnector
static Standalone<BlobMetadataDetailsRef> createBlobMetadata(BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName) {
Standalone<BlobMetadataDetailsRef> metadata;
metadata.domainId = domainId;
metadata.arena().dependsOn(domainName.arena());
metadata.domainName = domainName;
// 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
int type = deterministicRandom()->randomInt(0, 3);
int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12);
fmt::print("SimBlobMetadata ({})\n", domainId);
TraceEvent ev(SevDebug, "SimBlobMetadata");
ev.detail("DomainId", domainId).detail("TypeNum", type).detail("PartitionCount", partitionCount);
if (type == 0) {
// single storage location
metadata.base = StringRef(metadata.arena(), "file://fdbblob/" + std::to_string(domainId) + "/");
fmt::print(" {}\n", metadata.base.get().printable());
ev.detail("Base", metadata.base);
}
if (type == 1) {
// simulate hash prefixing in s3
metadata.base = StringRef(metadata.arena(), "file://fdbblob/"_sr);
ev.detail("Base", metadata.base);
fmt::print(" {} ({})\n", metadata.base.get().printable(), partitionCount);
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(metadata.arena(),
deterministicRandom()->randomUniqueID().shortString() + "-" +
std::to_string(domainId) + "/");
fmt::print(" {}\n", metadata.partitions.back().printable());
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
if (type == 2) {
// simulate separate storage location per partition
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(
metadata.arena(), "file://fdbblob" + std::to_string(domainId) + "_" + std::to_string(i) + "/");
fmt::print(" {}\n", metadata.partitions.back().printable());
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
// set random refresh + expire time
if (deterministicRandom()->coinflip()) {
metadata.refreshAt = now() + deterministicRandom()->random01() * SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
metadata.expireAt =
metadata.refreshAt + deterministicRandom()->random01() * SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
} else {
metadata.refreshAt = std::numeric_limits<double>::max();
metadata.expireAt = metadata.refreshAt;
}
return metadata;
}
ACTOR Future<Void> blobMetadataLookup(KmsConnectorInterface interf, KmsConnBlobMetadataReq req) {
state KmsConnBlobMetadataRep rep;
@ -261,7 +206,9 @@ ACTOR Future<Void> blobMetadataLookup(KmsConnectorInterface interf, KmsConnBlobM
if (it == simBlobMetadataStore.end()) {
// construct new blob metadata
it = simBlobMetadataStore
.insert({ domainInfo.domainId, createBlobMetadata(domainInfo.domainId, domainInfo.domainName) })
.insert({ domainInfo.domainId,
createRandomTestBlobMetadata(
SERVER_KNOBS->BG_URL, domainInfo.domainId, domainInfo.domainName) })
.first;
} else if (now() >= it->second.expireAt) {
// update random refresh and expire time

View File

@ -166,8 +166,8 @@ public:
successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc),
snapshotRequest("SnapshotRequest", cc) {
cfi = getConfigFollowerInterface(configSource);
logger = traceCounters(
"ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigConsumerMetrics");
logger = cc.traceCounters(
"ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigConsumerMetrics");
}
Future<Void> consume(ConfigBroadcaster& broadcaster) {

View File

@ -620,11 +620,13 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
std::string* coordFolder,
std::string baseFolder,
ClusterConnectionString connStr,
ClusterConnectionString otherConnStr,
bool useSeedFile,
AgentMode runBackupAgents,
std::string whitelistBinPaths,
ProtocolVersion protocolVersion,
ConfigDBType configDBType) {
ConfigDBType configDBType,
bool isDr) {
state ISimulator::ProcessInfo* simProcess = g_simulator->getCurrentProcess();
state UID randomId = nondeterministicRandom()->randomUniqueID();
state int cycles = 0;
@ -644,7 +646,8 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
.detail("Address", NetworkAddress(ip, port, true, false))
.detail("ZoneId", localities.zoneId())
.detail("WaitTime", waitTime)
.detail("Port", port);
.detail("Port", port)
.detail("IsDr", isDr);
wait(delay(waitTime));
@ -657,7 +660,8 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
processClass,
dataFolder->c_str(),
coordFolder->c_str(),
protocolVersion);
protocolVersion,
isDr);
wait(g_simulator->onProcess(
process,
TaskPriority::DefaultYield)); // Now switch execution to the process on which we will run
@ -724,6 +728,16 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
}
futures.push_back(success(onShutdown));
if (!g_simulator->globalHasSwitchedCluster() && g_simulator->hasSwitchedCluster(process->address)) {
// When switching machines between clusters, a simultaneous
// reboot followed by a reboot and switch can cause the
// reboot and switch to be ignored. Handle this case by
// sending the reboot and switch kill type when the process
// comes back online.
TraceEvent("RebootProcessAndSwitchLateReboot").detail("Address", process->address);
g_simulator->switchCluster(process->address);
process->shutdownSignal.send(ISimulator::KillType::RebootProcessAndSwitch);
}
wait(waitForAny(futures));
} catch (Error& e) {
// If in simulation, if we make it here with an error other than io_timeout but enASIOTimedOut is set
@ -830,6 +844,24 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
connRecord =
makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), connStr.toString());
}
} else if (onShutdown.get() == ISimulator::RebootProcessAndSwitch) {
TraceEvent("SimulatedFDBDRebootAndSwitch")
.detail("Cycles", cycles)
.detail("RandomId", randomId)
.detail("Address", process->address)
.detail("ZoneId", localities.zoneId())
.detail("KillType", shutdownResult)
.detail("ConnectionString", connStr.toString())
.detail("OtherConnectionString", otherConnStr.toString())
.detail("SwitchingTo", g_simulator->hasSwitchedCluster(process->address))
.detail("MachineId", process->machine->machineId);
// Handle the case where otherConnStr is '@'.
if (otherConnStr.toString().size() > 1) {
std::string newConnStr =
g_simulator->hasSwitchedCluster(process->address) ? otherConnStr.toString() : connStr.toString();
connRecord = makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), newConnStr);
}
} else {
TraceEvent("SimulatedFDBDJustRepeat")
.detail("Cycles", cycles)
@ -846,6 +878,7 @@ std::map<Optional<Standalone<StringRef>>, std::vector<std::vector<std::string>>>
// process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per
// process
ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
ClusterConnectionString otherConnStr,
std::vector<IPAddress> ips,
bool sslEnabled,
LocalityData localities,
@ -857,7 +890,8 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
bool sslOnly,
std::string whitelistBinPaths,
ProtocolVersion protocolVersion,
ConfigDBType configDBType) {
ConfigDBType configDBType,
bool isDr) {
state int bootCount = 0;
state std::vector<std::string> myFolders;
state std::vector<std::string> coordFolders;
@ -924,11 +958,13 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
&coordFolders[i],
baseFolder,
connStr,
otherConnStr,
useSeedFile,
agentMode,
whitelistBinPaths,
protocolVersion,
configDBType));
configDBType,
isDr));
g_simulator->setDiffProtocol = true;
} else {
processes.push_back(simulatedFDBDRebooter(clusterFile,
@ -942,11 +978,13 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
&coordFolders[i],
baseFolder,
connStr,
otherConnStr,
useSeedFile,
agentMode,
whitelistBinPaths,
g_network->protocolVersion(),
configDBType));
configDBType,
isDr));
}
TraceEvent("SimulatedMachineProcess", randomId)
.detail("Address", NetworkAddress(ips[i], listenPort, true, false))
@ -1311,6 +1349,7 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
// SOMEDAY: parse backup agent from test file
systemActors->push_back(reportErrors(
simulatedMachine(conn,
ClusterConnectionString(),
ipAddrs,
usingSSL,
localities,
@ -1322,7 +1361,8 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
usingSSL && (listenersPerProcess == 1 || processClass == ProcessClass::TesterClass),
whitelistBinPaths,
protocolVersion,
configDBType),
configDBType,
false),
processClass == ProcessClass::TesterClass ? "SimulatedTesterMachine" : "SimulatedMachine"));
}
@ -2346,20 +2386,24 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
// check the sslEnablementMap using only one ip
LocalityData localities(Optional<Standalone<StringRef>>(), zoneId, machineId, dcUID);
localities.set("data_hall"_sr, dcUID);
systemActors->push_back(reportErrors(simulatedMachine(conn,
ips,
sslEnabled,
localities,
processClass,
baseFolder,
false,
machine == useSeedForMachine,
requiresExtraDBMachines ? AgentOnly : AgentAddition,
sslOnly,
whitelistBinPaths,
protocolVersion,
configDBType),
"SimulatedMachine"));
systemActors->push_back(reportErrors(
simulatedMachine(conn,
requiresExtraDBMachines ? ClusterConnectionString(g_simulator->extraDatabases.at(0))
: ClusterConnectionString(),
ips,
sslEnabled,
localities,
processClass,
baseFolder,
false,
machine == useSeedForMachine,
requiresExtraDBMachines ? AgentOnly : AgentAddition,
sslOnly,
whitelistBinPaths,
protocolVersion,
configDBType,
false),
"SimulatedMachine"));
if (requiresExtraDBMachines) {
int cluster = 4;
@ -2376,6 +2420,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
LocalityData localities(Optional<Standalone<StringRef>>(), newZoneId, newMachineId, dcUID);
localities.set("data_hall"_sr, dcUID);
systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase),
conn,
extraIps,
sslEnabled,
localities,
@ -2387,7 +2432,8 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
sslOnly,
whitelistBinPaths,
protocolVersion,
configDBType),
configDBType,
true),
"SimulatedMachine"));
++cluster;
}
@ -2422,6 +2468,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
Optional<Standalone<StringRef>>(), newZoneId, newZoneId, Optional<Standalone<StringRef>>());
systemActors->push_back(
reportErrors(simulatedMachine(conn,
ClusterConnectionString(),
ips,
sslEnabled,
localities,
@ -2433,7 +2480,8 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
sslOnly,
whitelistBinPaths,
protocolVersion,
configDBType),
configDBType,
false),
"SimulatedTesterMachine"));
}
@ -2557,7 +2605,8 @@ ACTOR void setupAndRun(std::string dataFolder,
ProcessClass(ProcessClass::TesterClass, ProcessClass::CommandLineSource),
"",
"",
currentProtocolVersion());
currentProtocolVersion(),
false);
testSystem->excludeFromRestarts = true;
wait(g_simulator->onProcess(testSystem, TaskPriority::DefaultYield));
Sim2FileSystem::newFileSystem();

View File

@ -248,9 +248,9 @@ public:
lastTLogVersion(0), lastVersionWithData(0), peekVersion(0), compactionInProgress(Void()),
fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), debug_inApplyUpdate(false),
debug_lastValidateTime(0), versionLag(0), behind(false), counters(this) {
version.initMetric("StorageCacheData.Version"_sr, counters.cc.id);
desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.id);
oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.id);
version.initMetric("StorageCacheData.Version"_sr, counters.cc.getId());
desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.getId());
oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.getId());
newestAvailableVersion.insert(allKeys, invalidVersion);
newestDirtyVersion.insert(allKeys, invalidVersion);
@ -1188,7 +1188,7 @@ ACTOR Future<RangeResult> tryFetchRange(Database cx,
state RangeResult output;
state KeySelectorRef begin = firstGreaterOrEqual(keys.begin);
state KeySelectorRef end = firstGreaterOrEqual(keys.end);
state ReadOptions options = ReadOptions(Optional<UID>(), ReadType::FETCH);
state ReadOptions options = ReadOptions(ReadType::FETCH, CacheResult::False);
if (*isTooOld)
throw transaction_too_old();
@ -2224,11 +2224,10 @@ ACTOR Future<Void> storageCacheServer(StorageServerInterface ssi,
self.ck = cacheKeysPrefixFor(id).withPrefix(systemKeys.begin); // FFFF/02cacheKeys/[this server]/
actors.add(waitFailureServer(ssi.waitFailure.getFuture()));
actors.add(traceCounters("CacheMetrics",
self.thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&self.counters.cc,
self.thisServerID.toString() + "/CacheMetrics"));
actors.add(self.counters.cc.traceCounters("CacheMetrics",
self.thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
self.thisServerID.toString() + "/CacheMetrics"));
// fetch already cached ranges from the database and apply them before proceeding
wait(storageCacheStartUpWarmup(&self));

View File

@ -19,7 +19,7 @@
*/
#include "flow/UnitTest.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
int64_t StorageMetricSample::getEstimate(KeyRangeRef keys) const {

View File

@ -26,7 +26,6 @@
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/SpanContextMessage.h"
#include "fdbserver/TLogInterface.h"
@ -217,8 +216,6 @@ static const KeyRange persistTagMessagesKeys = prefixRange("TagMsg/"_sr);
static const KeyRange persistTagMessageRefsKeys = prefixRange("TagMsgRef/"_sr);
static const KeyRange persistTagPoppedKeys = prefixRange("TagPop/"_sr);
static const KeyRef persistClusterIdKey = "clusterId"_sr;
static Key persistTagMessagesKey(UID id, Tag tag, Version version) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(persistTagMessagesKeys.begin);
@ -306,13 +303,6 @@ struct TLogData : NonCopyable {
Deque<UID> spillOrder;
std::map<UID, Reference<struct LogData>> id_data;
// The durable cluster ID identifies which cluster the tlogs persistent
// data is written from. This value is restored from disk when the tlog
// restarts.
UID durableClusterId;
// The cluster-controller cluster ID stores the cluster ID read from the txnStateStore.
// It is cached in this variable.
UID ccClusterId;
UID dbgid;
UID workerID;
@ -652,10 +642,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2401,24 +2391,6 @@ ACTOR Future<Void> initPersistentState(TLogData* self, Reference<LogData> logDat
return Void();
}
ACTOR Future<UID> getClusterId(TLogData* self) {
state ReadYourWritesTransaction tr(self->cx);
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> clusterId = wait(tr.get(clusterIdKey));
if (clusterId.present()) {
return BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
} else {
return UID();
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// send stopped promise instead of LogData* to avoid reference cycles
ACTOR Future<Void> rejoinClusterController(TLogData* self,
TLogInterface tli,
@ -2441,26 +2413,14 @@ ACTOR Future<Void> rejoinClusterController(TLogData* self,
}
isDisplaced = isDisplaced && !inf.logSystemConfig.hasTLog(tli.id());
if (isDisplaced) {
state TraceEvent ev("TLogDisplaced", tli.id());
ev.detail("Reason", "DBInfoDoesNotContain")
TraceEvent("TLogDisplaced", tli.id())
.detail("Reason", "DBInfoDoesNotContain")
.detail("RecoveryCount", recoveryCount)
.detail("InfRecoveryCount", inf.recoveryCount)
.detail("RecoveryState", (int)inf.recoveryState)
.detail("LogSysConf", describe(inf.logSystemConfig.tLogs))
.detail("PriorLogs", describe(inf.priorCommittedLogServers))
.detail("OldLogGens", inf.logSystemConfig.oldTLogs.size());
// Read and cache cluster ID before displacing this tlog. We want
// to avoid removing the tlogs data if it has joined a new cluster
// with a different cluster ID.
// TODO: #5375
/*
state UID clusterId = wait(getClusterId(self));
ASSERT(clusterId.isValid());
self->ccClusterId = clusterId;
ev.detail("ClusterId", clusterId).detail("SelfClusterId", self->durableClusterId);
*/
if (BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFY_WORKER_REMOVED_MAX_LAG * deterministicRandom()->random01()));
throw worker_removed();
@ -2619,27 +2579,6 @@ ACTOR Future<Void> tLogEnablePopReq(TLogEnablePopRequest enablePopReq, TLogData*
return Void();
}
ACTOR Future<Void> updateDurableClusterID(TLogData* self) {
loop {
// Persist cluster ID once cluster has recovered.
if (self->dbInfo->get().recoveryState == RecoveryState::FULLY_RECOVERED) {
ASSERT(!self->durableClusterId.isValid());
state UID ccClusterId = self->dbInfo->get().client.clusterId;
self->durableClusterId = ccClusterId;
ASSERT(ccClusterId.isValid());
wait(self->persistentDataCommitLock.take());
state FlowLock::Releaser commitLockReleaser(self->persistentDataCommitLock);
self->persistentData->set(
KeyValueRef(persistClusterIdKey, BinaryWriter::toValue(ccClusterId, Unversioned())));
wait(self->persistentData->commit());
return Void();
}
wait(self->dbInfo->onChange());
}
}
ACTOR Future<Void> serveTLogInterface(TLogData* self,
TLogInterface tli,
Reference<LogData> logData,
@ -2930,26 +2869,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
@ -3027,7 +2966,6 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
state IKeyValueStore* storage = self->persistentData;
state Future<Optional<Value>> fFormat = storage->readValue(persistFormat.key);
state Future<Optional<Value>> fRecoveryLocation = storage->readValue(persistRecoveryLocationKey);
state Future<Optional<Value>> fClusterId = storage->readValue(persistClusterIdKey);
state Future<RangeResult> fVers = storage->readRange(persistCurrentVersionKeys);
state Future<RangeResult> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys);
state Future<RangeResult> fLocality = storage->readRange(persistLocalityKeys);
@ -3039,7 +2977,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
// FIXME: metadata in queue?
wait(waitForAll(std::vector{ fFormat, fRecoveryLocation, fClusterId }));
wait(waitForAll(std::vector{ fFormat, fRecoveryLocation }));
wait(waitForAll(std::vector{ fVers,
fKnownCommitted,
fLocality,
@ -3049,10 +2987,6 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
fProtocolVersions,
fTLogSpillTypes }));
if (fClusterId.get().present()) {
self->durableClusterId = BinaryReader::fromStringRef<UID>(fClusterId.get().get(), Unversioned());
}
if (fFormat.get().present() && !persistFormatReadableRange.contains(fFormat.get().get())) {
// FIXME: remove when we no longer need to test upgrades from 4.X releases
if (g_network->isSimulated()) {
@ -3315,7 +3249,7 @@ bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* p
}
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
e.code() == error_code_file_not_found || e.code() == error_code_invalid_cluster_id) {
e.code() == error_code_file_not_found) {
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
return true;
} else
@ -3591,86 +3525,50 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
TraceEvent("SharedTlog", tlogId);
try {
try {
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));
} else {
wait(ioTimeoutError(checkEmptyQueue(&self) && initPersistentStorage(&self),
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
}
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));
} else {
wait(ioTimeoutError(checkEmptyQueue(&self) && initPersistentStorage(&self),
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
}
// Disk errors need a chance to kill this actor.
wait(delay(0.000001));
// Disk errors need a chance to kill this actor.
wait(delay(0.000001));
if (recovered.canBeSet())
recovered.send(Void());
if (recovered.canBeSet())
recovered.send(Void());
if (!self.durableClusterId.isValid()) {
self.sharedActors.send(updateDurableClusterID(&self));
}
self.sharedActors.send(commitQueue(&self));
self.sharedActors.send(updateStorageLoop(&self));
self.sharedActors.send(traceRole(Role::SHARED_TRANSACTION_LOG, tlogId));
state Future<Void> activeSharedChange = Void();
self.sharedActors.send(commitQueue(&self));
self.sharedActors.send(updateStorageLoop(&self));
self.sharedActors.send(traceRole(Role::SHARED_TRANSACTION_LOG, tlogId));
state Future<Void> activeSharedChange = Void();
loop {
choose {
when(state InitializeTLogRequest req = waitNext(tlogRequests.getFuture())) {
if (!self.tlogCache.exists(req.recruitmentID)) {
self.tlogCache.set(req.recruitmentID, req.reply.getFuture());
self.sharedActors.send(
self.tlogCache.removeOnReady(req.recruitmentID, tLogStart(&self, req, locality)));
} else {
forwardPromise(req.reply, self.tlogCache.get(req.recruitmentID));
}
}
when(wait(error)) { throw internal_error(); }
when(wait(activeSharedChange)) {
if (activeSharedTLog->get() == tlogId) {
TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get());
self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD;
} else {
stopAllTLogs(&self, tlogId);
TraceEvent("SharedTLogQueueSpilling", self.dbgid)
.detail("NowActive", activeSharedTLog->get());
self.sharedActors.send(startSpillingInTenSeconds(&self, tlogId, activeSharedTLog));
}
activeSharedChange = activeSharedTLog->onChange();
loop {
choose {
when(state InitializeTLogRequest req = waitNext(tlogRequests.getFuture())) {
if (!self.tlogCache.exists(req.recruitmentID)) {
self.tlogCache.set(req.recruitmentID, req.reply.getFuture());
self.sharedActors.send(
self.tlogCache.removeOnReady(req.recruitmentID, tLogStart(&self, req, locality)));
} else {
forwardPromise(req.reply, self.tlogCache.get(req.recruitmentID));
}
}
when(wait(error)) { throw internal_error(); }
when(wait(activeSharedChange)) {
if (activeSharedTLog->get() == tlogId) {
TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get());
self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD;
} else {
stopAllTLogs(&self, tlogId);
TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get());
self.sharedActors.send(startSpillingInTenSeconds(&self, tlogId, activeSharedTLog));
}
activeSharedChange = activeSharedTLog->onChange();
}
}
} catch (Error& e) {
throw;
// TODO: #5375
/*
if (e.code() != error_code_worker_removed) {
throw;
}
// Don't need to worry about deleting data if there is no durable
// cluster ID.
if (!self.durableClusterId.isValid()) {
throw;
}
// When a tlog joins a new cluster and has data for an old cluster,
// it should automatically exclude itself to avoid being used in
// the new cluster.
auto recoveryState = self.dbInfo->get().recoveryState;
if (recoveryState == RecoveryState::FULLY_RECOVERED && self.ccClusterId.isValid() &&
self.durableClusterId.isValid() && self.ccClusterId != self.durableClusterId) {
state NetworkAddress address = g_network->getLocalAddress();
wait(excludeServers(self.cx, { AddressExclusion{ address.ip, address.port } }));
TraceEvent(SevWarnAlways, "TLogBelongsToExistingCluster")
.detail("ClusterId", self.durableClusterId)
.detail("NewClusterId", self.ccClusterId);
}
// If the tlog has a valid durable cluster ID, we don't want it to
// wipe its data! Throw this error to signal to `tlogTerminated` to
// close the persistent data store instead of deleting it.
throw invalid_cluster_id();
*/
}
} catch (Error& e) {
self.terminated.send(Void());

View File

@ -1635,7 +1635,6 @@ Future<Void> TagPartitionedLogSystem::endEpoch() {
Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
RecruitFromConfigurationReply const& recr,
Future<RecruitRemoteFromConfigurationReply> const& fRemoteWorkers,
UID clusterId,
DatabaseConfiguration const& config,
LogEpoch recoveryCount,
Version recoveryTransactionVersion,
@ -1646,7 +1645,6 @@ Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
return newEpoch(Reference<TagPartitionedLogSystem>::addRef(this),
recr,
fRemoteWorkers,
clusterId,
config,
recoveryCount,
recoveryTransactionVersion,
@ -2546,7 +2544,6 @@ std::vector<Tag> TagPartitionedLogSystem::getLocalTags(int8_t locality, const st
ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSystem* self,
Reference<TagPartitionedLogSystem> oldLogSystem,
Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers,
UID clusterId,
DatabaseConfiguration configuration,
LogEpoch recoveryCount,
Version recoveryTransactionVersion,
@ -2690,7 +2687,6 @@ ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst
req.startVersion = logSet->startVersion;
req.logRouterTags = 0;
req.txsTags = self->txsTags;
req.clusterId = clusterId;
req.recoveryTransactionVersion = recoveryTransactionVersion;
}
@ -2742,7 +2738,6 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
Reference<TagPartitionedLogSystem> oldLogSystem,
RecruitFromConfigurationReply recr,
Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers,
UID clusterId,
DatabaseConfiguration configuration,
LogEpoch recoveryCount,
Version recoveryTransactionVersion,
@ -2965,7 +2960,6 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
req.startVersion = logSystem->tLogs[0]->startVersion;
req.logRouterTags = logSystem->logRouterTags;
req.txsTags = logSystem->txsTags;
req.clusterId = clusterId;
req.recoveryTransactionVersion = recoveryTransactionVersion;
}
@ -3035,7 +3029,6 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
req.startVersion = oldLogSystem->knownCommittedVersion + 1;
req.logRouterTags = logSystem->logRouterTags;
req.txsTags = logSystem->txsTags;
req.clusterId = clusterId;
req.recoveryTransactionVersion = recoveryTransactionVersion;
}
@ -3094,7 +3087,6 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
logSystem->remoteRecovery = TagPartitionedLogSystem::newRemoteEpoch(logSystem.getPtr(),
oldLogSystem,
fRemoteWorkers,
clusterId,
configuration,
recoveryCount,
recoveryTransactionVersion,

View File

@ -38,6 +38,7 @@
#include "flow/IRandom.h"
#include "flow/Knobs.h"
#include "flow/ObjectSerializer.h"
#include "flow/PriorityMultiLock.actor.h"
#include "flow/serialize.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
@ -105,210 +106,6 @@ std::string addPrefix(std::string prefix, std::string lines) {
return s;
}
#define PRIORITYMULTILOCK_DEBUG 0
// A multi user lock with a concurrent holder limit where waiters are granted the lock according to
// an integer priority from 0 to maxPriority, inclusive, where higher integers are given priority.
//
// The interface is similar to FlowMutex except that lock holders can drop the lock to release it.
//
// Usage:
// Lock lock = wait(prioritylock.lock(priorityLevel));
// lock.release(); // Explicit release, or
// // let lock and all copies of lock go out of scope to release
class PriorityMultiLock {
public:
	// Waiting on the lock returns a Lock, which is really just a Promise<Void>
	// Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release
	// the Lock before it goes out of scope.
	struct Lock {
		// Returns this holder's slot; the runner actor observes the send via the promise's future.
		void release() { promise.send(Void()); }

		// This is exposed in case the caller wants to use/copy it directly
		Promise<Void> promise;
	};

private:
	// A queued lock request: the promise to fulfill with a Lock, plus the enqueue time
	// (queuedTime is only used for debug output).
	struct Waiter {
		Waiter() : queuedTime(now()) {}
		Promise<Lock> lockPromise;
		double queuedTime;
	};

	typedef Deque<Waiter> Queue;

#if PRIORITYMULTILOCK_DEBUG
#define prioritylock_printf(...) printf(__VA_ARGS__)
#else
#define prioritylock_printf(...)
#endif

public:
	// concurrency: maximum number of Locks that may be outstanding at once.
	// maxPriority: highest valid priority for lock(); one FIFO waiter queue is kept per priority 0..maxPriority.
	// launchLimit: bound on consecutive grants from a single priority queue before the runner rotates to the
	//              next priority, so that one busy priority cannot starve the others indefinitely.
	PriorityMultiLock(int concurrency, int maxPriority, int launchLimit = std::numeric_limits<int>::max())
	  : concurrency(concurrency), available(concurrency), waiting(0), launchLimit(launchLimit) {
		waiters.resize(maxPriority + 1);
		fRunner = runner(this);
	}

	~PriorityMultiLock() { prioritylock_printf("destruct"); }

	// Permanently shut down the lock: break the runner's error future, cancel the runner actor,
	// and drop all tracked runner futures and queued waiters. Outstanding lock() futures will
	// see broken_promise when their Waiter's promise is destroyed.
	void kill() {
		brokenOnDestruct.sendError(broken_promise());
		fRunner.cancel();
		runners.clear();
		for (auto& w : waiters) {
			w.clear();
		}
	}

	// Request the lock at the given priority. The returned future is fulfilled with a Lock once a
	// concurrency slot has been granted to this caller.
	// NOTE(review): priority is used to index waiters[] unchecked — callers must pass
	// 0 <= priority <= maxPriority; confirm all call sites respect this.
	Future<Lock> lock(int priority = 0) {
		prioritylock_printf("lock begin %s\n", toString().c_str());

		// This shortcut may enable a waiter to jump the line when the releaser loop yields
		if (available > 0) {
			--available;
			Lock p;
			addRunner(p);
			prioritylock_printf("lock exit immediate %s\n", toString().c_str());
			return p;
		}

		Waiter w;
		waiters[priority].push_back(w);
		++waiting;
		prioritylock_printf("lock exit queued %s\n", toString().c_str());
		return w.lockPromise.getFuture();
	}

	// Debug/status string: slot accounting, runner-queue state, and per-priority waiter counts.
	std::string toString() const {
		// Count runner futures that have completed but not yet been popped by the runner actor.
		int runnersDone = 0;
		for (int i = 0; i < runners.size(); ++i) {
			if (runners[i].isReady()) {
				++runnersDone;
			}
		}

		std::string s =
		    format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d runnersDone=%d ",
		           this,
		           concurrency,
		           available,
		           concurrency - available,
		           waiting,
		           runners.size(),
		           runnersDone);

		for (int i = 0; i < waiters.size(); ++i) {
			s += format("p%d_waiters=%u ", i, waiters[i].size());
		}

		s += "}";
		return s;
	}

private:
	// Track the given Lock: when its promise is fulfilled (release()) or dropped (broken), the
	// mapped future returns the slot (++available) and wakes the runner if waiters are queued or
	// the runner-future queue has grown large enough to be worth draining.
	void addRunner(Lock& lock) {
		runners.push_back(map(ready(lock.promise.getFuture()), [=](Void) {
			prioritylock_printf("Lock released\n");
			++available;
			if (waiting > 0 || runners.size() > 100) {
				release.trigger();
			}
			return Void();
		}));
	}

	// Background actor that hands slots to waiters. It wakes when a holder releases a slot, then
	// scans the priority queues starting from the current priority and moving downward (wrapping
	// from 0 back to maxPriority), granting a bounded number of locks per priority per visit.
	ACTOR static Future<Void> runner(PriorityMultiLock* self) {
		state int sinceYield = 0;
		state Future<Void> error = self->brokenOnDestruct.getFuture();
		state int maxPriority = self->waiters.size() - 1;

		// Priority to try to run tasks from next
		state int priority = maxPriority;
		state Queue* pQueue = &self->waiters[maxPriority];

		// Track the number of waiters unlocked at the same priority in a row
		state int lastPriorityCount = 0;

		loop {
			// Cleanup finished runner futures at the front of the runner queue.
			while (!self->runners.empty() && self->runners.front().isReady()) {
				self->runners.pop_front();
			}

			// Wait for a runner to release its lock
			wait(self->release.onTrigger());
			prioritylock_printf("runner wakeup %s\n", self->toString().c_str());

			// Yield to the run loop periodically so this actor cannot monopolize it.
			if (++sinceYield == 1000) {
				sinceYield = 0;
				wait(delay(0));
			}

			// While there are available slots and there are waiters, launch tasks
			while (self->available > 0 && self->waiting > 0) {
				prioritylock_printf("Checking priority=%d lastPriorityCount=%d %s\n",
				                    priority,
				                    lastPriorityCount,
				                    self->toString().c_str());

				// Grant locks from the current queue until it is empty or the per-visit limit is hit.
				// NOTE(review): the pre-increment means at most launchLimit-1 consecutive grants per
				// visit, not launchLimit — confirm whether this off-by-one is intended.
				while (!pQueue->empty() && ++lastPriorityCount < self->launchLimit) {
					Waiter w = pQueue->front();
					pQueue->pop_front();
					--self->waiting;
					Lock lock;
					prioritylock_printf("  Running waiter priority=%d wait=%f %s\n",
					                    priority,
					                    now() - w.queuedTime,
					                    self->toString().c_str());
					w.lockPromise.send(lock);

					// Self may have been destructed during the lock callback
					if (error.isReady()) {
						throw error.getError();
					}

					// If the lock was not already released, add it to the runners future queue
					// (if it was released synchronously inside send(), the slot was never consumed,
					// so available is intentionally not decremented).
					if (lock.promise.canBeSet()) {
						self->addRunner(lock);

						// A slot has been consumed, so stop reading from this queue if there aren't any more
						if (--self->available == 0) {
							break;
						}
					}
				}

				// If there are no more slots available, then don't move to the next priority
				if (self->available == 0) {
					break;
				}

				// Decrease priority, wrapping around to max from 0
				if (priority == 0) {
					priority = maxPriority;
				} else {
					--priority;
				}

				pQueue = &self->waiters[priority];
				lastPriorityCount = 0;
			}
		}
	}

	int concurrency; // Total number of concurrency slots
	int available; // Slots not currently held by a Lock
	int waiting; // Total queued waiters across all priorities
	int launchLimit; // Bound on consecutive grants from one priority per runner visit
	std::vector<Queue> waiters; // One FIFO waiter queue per priority level
	Deque<Future<Void>> runners; // Futures that return slots when Locks are released
	Future<Void> fRunner; // The runner() actor
	AsyncTrigger release; // Pulsed by addRunner callbacks to wake runner()
	Promise<Void> brokenOnDestruct; // Broken by kill() so runner() can detect shutdown mid-grant
};
// Some convenience functions for debugging to stringify various structures
// Classes can add compatibility by either specializing toString<T> or implementing
// std::string toString() const;
@ -1677,6 +1474,8 @@ struct RedwoodMetrics {
kvSizeReadByGetRange = Reference<Histogram>(
new Histogram(Reference<HistogramRegistry>(), "kvSize", "ReadByGetRange", Histogram::Unit::bytes));
ioLock = nullptr;
// These histograms are used for Btree events, hence level > 0
unsigned int levelCounter = 0;
for (RedwoodMetrics::Level& level : levels) {
@ -1719,6 +1518,8 @@ struct RedwoodMetrics {
// btree levels and one extra level for non btree level.
Level levels[btreeLevels + 1];
metrics metric;
// pointer to the priority multi lock used in pager
PriorityMultiLock* ioLock;
Reference<Histogram> kvSizeWritten;
Reference<Histogram> kvSizeReadByGet;
@ -1773,9 +1574,12 @@ struct RedwoodMetrics {
// The string is a reasonably well formatted page of information
void getFields(TraceEvent* e, std::string* s = nullptr, bool skipZeroes = false);
void getIOLockFields(TraceEvent* e, std::string* s = nullptr);
std::string toString(bool clearAfter) {
std::string s;
getFields(nullptr, &s);
getIOLockFields(nullptr, &s);
if (clearAfter) {
clear();
@ -1810,6 +1614,7 @@ ACTOR Future<Void> redwoodMetricsLogger() {
double elapsed = now() - g_redwoodMetrics.startTime;
e.detail("Elapsed", elapsed);
g_redwoodMetrics.getFields(&e);
g_redwoodMetrics.getIOLockFields(&e);
g_redwoodMetrics.clear();
}
}
@ -2220,7 +2025,7 @@ public:
bool memoryOnly,
Reference<IPageEncryptionKeyProvider> keyProvider,
Promise<Void> errorPromise = {})
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, FLOW_KNOBS->MAX_OUTSTANDING / 2),
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS),
pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize),
filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise),
remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) {
@ -2232,6 +2037,7 @@ public:
// This sets the page cache size for all PageCacheT instances using the same evictor
pageCache.evictor().sizeLimit = pageCacheBytes;
g_redwoodMetrics.ioLock = &ioLock;
if (!g_redwoodMetricsActor.isValid()) {
g_redwoodMetricsActor = redwoodMetricsLogger();
}
@ -8121,8 +7927,7 @@ RedwoodRecordRef VersionedBTree::dbEnd("\xff\xff\xff\xff\xff"_sr);
class KeyValueStoreRedwood : public IKeyValueStore {
public:
KeyValueStoreRedwood(std::string filename, UID logID, Reference<IPageEncryptionKeyProvider> encryptionKeyProvider)
: m_filename(filename), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS, 0),
prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) {
: m_filename(filename), prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) {
int pageSize =
BUGGIFY ? deterministicRandom()->randomInt(1000, 4096 * 4) : SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE;
@ -8187,6 +7992,8 @@ public:
ACTOR void shutdown(KeyValueStoreRedwood* self, bool dispose) {
TraceEvent(SevInfo, "RedwoodShutdown").detail("Filename", self->m_filename).detail("Dispose", dispose);
g_redwoodMetrics.ioLock = nullptr;
// In simulation, if the instance is being disposed of then sometimes run destructive sanity check.
if (g_network->isSimulated() && dispose && BUGGIFY) {
// Only proceed if the last commit is a success, but don't throw if it's not because shutdown
@ -8289,7 +8096,6 @@ public:
f.get();
} else {
CODE_PROBE(true, "Uncached forward range read seek");
wait(store(lock, self->m_concurrentReads.lock()));
wait(f);
}
@ -8345,7 +8151,6 @@ public:
f.get();
} else {
CODE_PROBE(true, "Uncached reverse range read seek");
wait(store(lock, self->m_concurrentReads.lock()));
wait(f);
}
@ -8412,9 +8217,6 @@ public:
wait(self->m_tree->initBTreeCursor(
&cur, self->m_tree->getLastCommittedVersion(), PagerEventReasons::PointRead, options));
// Not locking for point reads, instead relying on IO priority lock
// state PriorityMultiLock::Lock lock = wait(self->m_concurrentReads.lock());
++g_redwoodMetrics.metric.opGet;
wait(cur.seekGTE(key));
if (cur.isValid() && cur.get().key == key) {
@ -8450,7 +8252,6 @@ private:
Future<Void> m_init;
Promise<Void> m_closed;
Promise<Void> m_error;
PriorityMultiLock m_concurrentReads;
bool prefetch;
Version m_nextCommitVersion;
Reference<IPageEncryptionKeyProvider> m_keyProvider;
@ -9086,6 +8887,43 @@ void RedwoodMetrics::getFields(TraceEvent* e, std::string* s, bool skipZeroes) {
}
}
// Report the pager IO lock's per-priority activity, either as TraceEvent details (e) and/or
// appended to a human-readable status string (s). No-op when no pager has registered its lock.
void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) {
	if (!ioLock) {
		return;
	}

	const int highestPriority = ioLock->maxPriority();

	// Structured output: overall totals plus one Active/Await pair per priority level.
	if (e) {
		e->detail("ActiveReads", ioLock->totalRunners());
		e->detail("AwaitReads", ioLock->totalWaiters());
		for (int p = 0; p <= highestPriority; ++p) {
			e->detail(format("ActiveP%d", p), ioLock->numRunners(p));
			e->detail(format("AwaitP%d", p), ioLock->numWaiters(p));
		}
	}

	// Text output: a totals line, then a line of Active counts and a line of Await counts.
	if (s) {
		*s += "\n";
		*s += format("%-15s %-8u ", "ActiveReads", ioLock->totalRunners());
		*s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters());
		*s += "\n";
		for (int p = 0; p <= highestPriority; ++p) {
			const std::string label = "ActiveP" + std::to_string(p);
			*s += format("%-15s %-8u ", label.c_str(), ioLock->numRunners(p));
		}
		*s += "\n";
		for (int p = 0; p <= highestPriority; ++p) {
			const std::string label = "AwaitP" + std::to_string(p);
			*s += format("%-15s %-8u ", label.c_str(), ioLock->numWaiters(p));
		}
	}
}
TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") {
ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[0] == 3);
ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[1] == 4);
@ -11569,3 +11407,57 @@ TEST_CASE(":/redwood/performance/histograms") {
return Void();
}
// Test helper: acquires pml at the given priority, holds the lock across a short random delay,
// then increments *pout so the caller can observe how much progress each priority made.
ACTOR Future<Void> waitLockIncrement(PriorityMultiLock* pml, int priority, int* pout) {
	// Held for the remainder of the actor; the slot is returned when `lock` goes out of scope.
	state PriorityMultiLock::Lock lock = wait(pml->lock(priority));
	wait(delay(deterministicRandom()->random01() * .1));
	++*pout;
	return Void();
}
// Stress test: saturate a PriorityMultiLock with waiters at several priorities, then release the
// initial holders and verify (by observing printed counts) that all queued tasks eventually run.
TEST_CASE("/redwood/PriorityMultiLock") {
	state std::vector<int> priorities = { 10, 20, 40 };
	state int concurrency = 25;
	// NOTE(review): the PriorityMultiLock definition visible in this file takes (int concurrency,
	// int maxPriority, int launchLimit); passing the priorities vector here assumes a different
	// constructor overload — confirm the intended signature.
	state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);

	// One progress counter per priority level.
	state std::vector<int> counts;
	counts.resize(priorities.size(), 0);

	// Clog the lock by taking concurrency locks at each level
	state std::vector<Future<PriorityMultiLock::Lock>> lockFutures;
	for (int i = 0; i < priorities.size(); ++i) {
		for (int j = 0; j < concurrency; ++j) {
			lockFutures.push_back(pml->lock(i));
		}
	}

	// Wait for n = concurrency locks to be acquired
	wait(quorum(lockFutures, concurrency));

	// Queue 10,000 lock/increment tasks spread evenly across the priority levels.
	state std::vector<Future<Void>> futures;
	for (int i = 0; i < 10e3; ++i) {
		int p = i % priorities.size();
		futures.push_back(waitLockIncrement(pml, p, &counts[p]));
	}

	state Future<Void> f = waitForAll(futures);

	// Release the locks
	lockFutures.clear();

	// Print stats and wait for all futures to be ready
	loop {
		choose {
			when(wait(delay(1))) {
				printf("counts: ");
				for (auto c : counts) {
					printf("%d ", c);
				}
				printf("  pml: %s\n", pml->toString().c_str());
			}
			when(wait(f)) { break; }
		}
	}

	// NOTE(review): raw new/delete — f throwing would leak pml; acceptable in a test, but a
	// std::unique_ptr would be safer if the surrounding framework permits it.
	delete pml;
	return Void();
}

View File

@ -3341,6 +3341,7 @@ public:
AsyncVar<std::pair<bool, Optional<std::vector<Optional<Key>>>>>
changedDcIds; // current DC priorities to change second, and whether the cluster controller has been changed
UID id;
Reference<AsyncVar<Optional<UID>>> clusterId;
std::vector<Reference<RecruitWorkersInfo>> outstandingRecruitmentRequests;
std::vector<Reference<RecruitRemoteWorkersInfo>> outstandingRemoteRecruitmentRequests;
std::vector<std::pair<RecruitStorageRequest, double>> outstandingStorageRequests;
@ -3412,15 +3413,16 @@ public:
ClusterControllerData(ClusterControllerFullInterface const& ccInterface,
LocalityData const& locality,
ServerCoordinators const& coordinators)
ServerCoordinators const& coordinators,
Reference<AsyncVar<Optional<UID>>> clusterId)
: gotProcessClasses(false), gotFullyRecoveredConfig(false), shouldCommitSuicide(false),
clusterControllerProcessId(locality.processId()), clusterControllerDcId(locality.dcId()), id(ccInterface.id()),
ac(false), outstandingRequestChecker(Void()), outstandingRemoteRequestChecker(Void()), startTime(now()),
goodRecruitmentTime(Never()), goodRemoteRecruitmentTime(Never()), datacenterVersionDifference(0),
versionDifferenceUpdated(false), remoteDCMonitorStarted(false), remoteTransactionSystemDegraded(false),
recruitDistributor(false), recruitRatekeeper(false), recruitBlobManager(false), recruitBlobMigrator(false),
recruitEncryptKeyProxy(false), recruitConsistencyScan(false),
clusterControllerMetrics("ClusterController", id.toString()),
clusterId(clusterId), ac(false), outstandingRequestChecker(Void()), outstandingRemoteRequestChecker(Void()),
startTime(now()), goodRecruitmentTime(Never()), goodRemoteRecruitmentTime(Never()),
datacenterVersionDifference(0), versionDifferenceUpdated(false), remoteDCMonitorStarted(false),
remoteTransactionSystemDegraded(false), recruitDistributor(false), recruitRatekeeper(false),
recruitBlobManager(false), recruitBlobMigrator(false), recruitEncryptKeyProxy(false),
recruitConsistencyScan(false), clusterControllerMetrics("ClusterController", id.toString()),
openDatabaseRequests("OpenDatabaseRequests", clusterControllerMetrics),
registerWorkerRequests("RegisterWorkerRequests", clusterControllerMetrics),
getWorkersRequests("GetWorkersRequests", clusterControllerMetrics),

View File

@ -289,11 +289,10 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_DURATION_EVENT_NAME));
clusterRecoveryAvailableEventHolder = makeReference<EventCacheHolder>(
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_AVAILABLE_EVENT_NAME));
logger = traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME),
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
&cc,
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME));
logger = cc.traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME),
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME));
if (forceRecovery && !controllerData->clusterControllerDcId.present()) {
TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log();
forceRecovery = false;

View File

@ -468,8 +468,6 @@ class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
bool recruitTss,
Reference<TSSPairState> tssState);
Future<UID> getClusterId();
// return the next ServerID in storageWiggler
Future<UID> getNextWigglingServerID();

View File

@ -117,6 +117,7 @@ public:
virtual Future<Void> moveKeys(const MoveKeysParams& params) = 0;
// metrics.second is the number of key-ranges (i.e., shards) in the 'keys' key-range
virtual Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
@ -136,8 +137,6 @@ public:
virtual Future<Optional<Value>> readRebalanceDDIgnoreKey() const { return {}; }
virtual Future<UID> getClusterId() const { return {}; }
virtual Future<Void> waitDDTeamInfoPrintSignal() const { return Never(); }
virtual Future<std::vector<ProcessData>> getWorkers() const = 0;
@ -221,8 +220,6 @@ public:
Future<Optional<Value>> readRebalanceDDIgnoreKey() const override;
Future<UID> getClusterId() const override;
Future<Void> waitDDTeamInfoPrintSignal() const override;
Future<std::vector<ProcessData>> getWorkers() const override;

View File

@ -476,6 +476,8 @@ struct ShardSizeBounds {
bool operator==(ShardSizeBounds const& rhs) const {
return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError;
}
static ShardSizeBounds shardSizeBoundsBeforeTrack();
};
// Gets the permitted size and IO bounds for a shard

View File

@ -46,6 +46,7 @@ class GrvProxyTransactionTagThrottler {
: req(req), startTime(now()), sequenceNumber(++lastSequenceNumber) {}
void updateProxyTagThrottledDuration();
bool isMaxThrottled() const;
};
struct TagQueue {
@ -56,6 +57,8 @@ class GrvProxyTransactionTagThrottler {
explicit TagQueue(double rate) : rateInfo(rate) {}
void setRate(double rate);
bool isMaxThrottled() const;
void rejectRequests();
};
// Track the budgets for each tag
@ -69,8 +72,8 @@ public:
// If a request is ready to be executed, it is sent to the deque
// corresponding to its priority. If not, the request remains queued.
void releaseTransactions(double elapsed,
SpannedDeque<GetReadVersionRequest>& outBatchPriority,
SpannedDeque<GetReadVersionRequest>& outDefaultPriority);
Deque<GetReadVersionRequest>& outBatchPriority,
Deque<GetReadVersionRequest>& outDefaultPriority);
void addRequest(GetReadVersionRequest const&);

View File

@ -29,7 +29,7 @@
#include "fdbserver/IClosable.h"
#include "fdbserver/IPageEncryptionKeyProvider.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
struct CheckpointRequest {
const Version version; // The FDB version at which the checkpoint is created.

View File

@ -641,7 +641,6 @@ struct ILogSystem {
virtual Future<Reference<ILogSystem>> newEpoch(
RecruitFromConfigurationReply const& recr,
Future<struct RecruitRemoteFromConfigurationReply> const& fRemoteWorkers,
UID clusterId,
DatabaseConfiguration const& config,
LogEpoch recoveryCount,
Version recoveryTransactionVersion,

View File

@ -21,10 +21,11 @@
#ifndef FOUNDATIONDB_MOCKGLOBALSTATE_H
#define FOUNDATIONDB_MOCKGLOBALSTATE_H
#include "StorageMetrics.h"
#include "StorageMetrics.actor.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/KeyLocationService.h"
#include "SimulatedCluster.h"
#include "ShardsAffectedByTeamFailure.h"
@ -51,9 +52,11 @@ inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) {
return false;
}
class MockStorageServer {
class MockStorageServer : public IStorageMetricsService {
friend struct MockGlobalStateTester;
ActorCollection actors;
public:
struct ShardInfo {
MockShardStatus status;
@ -73,8 +76,6 @@ public:
// size() and nthRange() would use the metrics as index instead
KeyRangeMap<ShardInfo> serverKeys;
// sampled metrics
StorageServerMetrics metrics;
CoalescedKeyRangeMap<bool, int64_t, KeyBytesMetric<int64_t>> byteSampleClears;
StorageServerInterface ssi; // serve RPC requests
@ -103,6 +104,35 @@ public:
uint64_t sumRangeSize(KeyRangeRef range) const;
void addActor(Future<Void> future) override;
void getSplitPoints(SplitRangeRequest const& req) override;
Future<Void> waitMetricsTenantAware(const WaitMetricsRequest& req) override;
void getStorageMetrics(const GetStorageMetricsRequest& req) override;
template <class Reply>
static constexpr bool isLoadBalancedReply = std::is_base_of_v<LoadBalancedReply, Reply>;
template <class Reply>
typename std::enable_if_t<isLoadBalancedReply<Reply>, void> sendErrorWithPenalty(const ReplyPromise<Reply>& promise,
const Error& err,
double penalty) {
Reply reply;
reply.error = err;
reply.penalty = penalty;
promise.send(reply);
}
template <class Reply>
typename std::enable_if_t<!isLoadBalancedReply<Reply>, void>
sendErrorWithPenalty(const ReplyPromise<Reply>& promise, const Error& err, double) {
promise.sendError(err);
}
Future<Void> run();
protected:
void threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
@ -112,8 +142,13 @@ protected:
void twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize);
};
class MockGlobalState {
class MockGlobalStateImpl;
class MockGlobalState : public IKeyLocationService {
friend struct MockGlobalStateTester;
friend class MockGlobalStateImpl;
std::vector<StorageServerInterface> extractStorageServerInterfaces(const std::vector<UID>& ids) const;
public:
typedef ShardsAffectedByTeamFailure::Team Team;
@ -162,7 +197,37 @@ public:
* * mgs.shardMapping doesnt have any information about X
* * mgs.allServer[X] is existed
*/
bool allShardRemovedFromServer(const UID& serverId);
bool allShardsRemovedFromServer(const UID& serverId);
// SOMEDAY: NativeAPI::waitStorageMetrics should share the code in the future, this is a simpler version of it
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount);
Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(const KeyRange& keys,
const StorageMetrics& limit,
const StorageMetrics& estimated,
const Optional<int>& minSplitBytes);
Future<KeyRangeLocationInfo> getKeyLocation(TenantInfo tenant,
Key key,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Reverse isBackward,
Version version) override;
Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations(TenantInfo tenant,
KeyRange keys,
int limit,
Reverse reverse,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Version version) override;
};
#endif // FOUNDATIONDB_MOCKGLOBALSTATE_H

View File

@ -156,10 +156,20 @@ struct ProxyStats {
specialCounter(cc, "NumTenants", [pTenantMap]() { return pTenantMap ? pTenantMap->size() : 0; });
specialCounter(cc, "MaxCompute", [this]() { return this->getAndResetMaxCompute(); });
specialCounter(cc, "MinCompute", [this]() { return this->getAndResetMinCompute(); });
logger = traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ProxyMetrics");
logger = cc.traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ProxyMetrics");
}
};
struct ExpectedIdempotencyIdCountForKey {
Version commitVersion = invalidVersion;
int16_t idempotencyIdCount = 0;
uint8_t batchIndexHighByte = 0;
ExpectedIdempotencyIdCountForKey() {}
ExpectedIdempotencyIdCountForKey(Version commitVersion, int16_t idempotencyIdCount, uint8_t batchIndexHighByte)
: commitVersion(commitVersion), idempotencyIdCount(idempotencyIdCount), batchIndexHighByte(batchIndexHighByte) {}
};
struct ProxyCommitData {
UID dbgid;
int64_t commitBatchesMemBytesCount;
@ -226,6 +236,9 @@ struct ProxyCommitData {
bool isEncryptionEnabled = false;
PromiseStream<ExpectedIdempotencyIdCountForKey> expectedIdempotencyIdCountForKey;
Standalone<VectorRef<MutationRef>> idempotencyClears;
// The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly
// more CPU efficient. When a tag related to a storage server does change, we empty out all of these vectors to
// signify they must be repopulated. We do not repopulate them immediately to avoid a slow task.

Some files were not shown because too many files have changed in this diff Show More