diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9549297305..35845dbb08 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ If you have questions, we encourage you to engage in discussion on the [communit ## Before you get started ### Community Guidelines -We want the FoundationDB community to be as welcoming and inclusive as possible, and have adopted a [Code of Conduct](CODE_OF_CONDUCT.md) that we ask all community members to read and observe. +We want the FoundationDB community to be as welcoming and inclusive as possible, and have adopted a [Code of Conduct](CODE_OF_CONDUCT.md) that we ask all community members to read and abide by. ### Project Licensing By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the Apache 2.0 license. @@ -34,7 +34,7 @@ Members of the Apple FoundationDB team are part of the core committers helping r ## Contributing ### Opening a Pull Request -We love pull requests! For minor changes, feel free to open up a PR directly. For larger feature development and any changes that may require community discussion, we ask that you discuss your ideas on the [community forums](https://forums.foundationdb.org) prior to opening a PR, and then reference that thread within your PR comment. Please refer to [FoundationDB Commit Process](https://github.com/apple/foundationdb/wiki/FoundationDB-Commit-Process) for more detailed guidelines. +We love pull requests! For minor changes, feel free to open up a PR directly. For larger feature development and any changes that may require community discussion, we ask that you discuss your ideas on the [community forums](https://forums.foundationdb.org) prior to opening a PR, and then reference that thread within your PR comment. 
Please refer to the [FoundationDB Commit Process](https://github.com/apple/foundationdb/wiki/FoundationDB-Commit-Process) for more detailed guidelines. CI will be run automatically for core committers, and for community PRs it will be initiated by the request of a core committer. Tests can also be run locally via `ctest`, and core committers can run additional validation on pull requests prior to merging them. @@ -46,10 +46,10 @@ To report a security issue, please **DO NOT** start by filing a public issue or ## Project Communication ### Community Forums -We encourage your participation asking questions and helping improve the FoundationDB project. Check out the [FoundationDB community forums](https://forums.foundationdb.org), which serve a similar function as mailing lists in many open source projects. The forums are organized into three sections: +We encourage your participation asking questions and helping improve the FoundationDB project. Check out the [FoundationDB community forums](https://forums.foundationdb.org), which serve a similar function as mailing lists in many open source projects. The forums are organized into three categories: * [Development](https://forums.foundationdb.org/c/development): For discussing the internals and development of the FoundationDB core, as well as layers. -* [Using FoundationDB](https://forums.foundationdb.org/c/using-foundationdb): For discussing user-facing topics. Getting started and have a question? This is the place for you. +* [Using FoundationDB](https://forums.foundationdb.org/c/using-foundationdb): For discussing user-facing topics. Getting started and have a question? This is the category for you. * [Site Feedback](https://forums.foundationdb.org/c/site-feedback): A category for discussing the forums and the OSS project, its organization, how it works, and how we can improve it. ### Using GitHub Issues and Community Forums @@ -63,4 +63,4 @@ GitHub Issues should be used for tracking tasks. 
If you know the specific code t * Implementing an agreed upon feature: *GitHub Issues* ### Project and Development Updates -Stay connected to the project and the community! For project and community updates, follow the [FoundationDB project blog](https://www.foundationdb.org/blog/). Development announcements will be made via the community forums' [dev-announce](https://forums.foundationdb.org/c/development/dev-announce) section. +Stay connected to the project and the community! For project and community updates, follow the [FoundationDB project blog](https://www.foundationdb.org/blog/). Development announcements will be made via the community forums' [dev-announce](https://forums.foundationdb.org/c/development/dev-announce) category. diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 9e864aa509..1f31a8739c 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -139,8 +139,12 @@ if(NOT WIN32) test/apitester/TesterTestSpec.cpp test/apitester/TesterTestSpec.h test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp + test/apitester/TesterBlobGranuleErrorsWorkload.cpp + test/apitester/TesterBlobGranuleUtil.cpp + test/apitester/TesterBlobGranuleUtil.h test/apitester/TesterCancelTransactionWorkload.cpp test/apitester/TesterCorrectnessWorkload.cpp + test/apitester/TesterExampleWorkload.cpp test/apitester/TesterKeyValueStore.cpp test/apitester/TesterKeyValueStore.h test/apitester/TesterOptions.h @@ -332,6 +336,24 @@ if(NOT WIN32) @SERVER_CA_FILE@ ) + add_test(NAME fdb_c_upgrade_to_future_version + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --process-number 3 + ) + set_tests_properties("fdb_c_upgrade_to_future_version" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") + + add_test(NAME fdb_c_upgrade_to_future_version_blob_granules + COMMAND 
${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml + --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --blob-granules-enabled + --process-number 3 + ) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER) add_test(NAME fdb_c_upgrade_single_threaded_630api COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py @@ -439,7 +461,7 @@ if (OPEN_FOR_IDE) target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads) target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include) -elseif(NOT WIN32 AND NOT APPLE AND NOT USE_UBSAN) # Linux Only, non-ubsan only +elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-sanitizer only set(SHIM_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -465,7 +487,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_UBSAN) # Linux Only, non-ubsan only DEPENDS ${IMPLIBSO_SRC} COMMENT "Generating source code for C shim library") - add_library(fdb_c_shim SHARED ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp) + add_library(fdb_c_shim STATIC ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp) target_link_options(fdb_c_shim PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map,-z,nodelete,-z,noexecstack") target_link_libraries(fdb_c_shim PUBLIC dl) target_include_directories(fdb_c_shim PUBLIC @@ -492,7 +514,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_UBSAN) # Linux Only, non-ubsan only --api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests ) -endif() # End Linux only, non-ubsan only +endif() # End Linux only, non-sanitizer only # TODO: re-enable once the old vcxproj-based build system is removed.
#generate_export_header(fdb_c EXPORT_MACRO_NAME "DLLEXPORT" @@ -537,7 +559,7 @@ fdb_install( DESTINATION_SUFFIX "/cmake/${targets_export_name}" COMPONENT clients) -if(NOT WIN32 AND NOT APPLE AND NOT USE_UBSAN) # Linux Only, non-ubsan only +if(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-sanitizer only fdb_install( FILES foundationdb/fdb_c_shim.h diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index c97604b98c..bc16cbf1a1 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -79,9 +79,10 @@ extern "C" DLLEXPORT fdb_bool_t fdb_error_predicate(int predicate_test, fdb_erro if (predicate_test == FDBErrorPredicates::RETRYABLE_NOT_COMMITTED) { return code == error_code_not_committed || code == error_code_transaction_too_old || code == error_code_future_version || code == error_code_database_locked || - code == error_code_proxy_memory_limit_exceeded || code == error_code_batch_transaction_throttled || - code == error_code_process_behind || code == error_code_tag_throttled || - code == error_code_unknown_tenant; + code == error_code_grv_proxy_memory_limit_exceeded || + code == error_code_commit_proxy_memory_limit_exceeded || + code == error_code_batch_transaction_throttled || code == error_code_process_behind || + code == error_code_tag_throttled || code == error_code_unknown_tenant; } return false; } @@ -238,6 +239,10 @@ fdb_error_t fdb_future_get_version_v619(FDBFuture* f, int64_t* out_version) { CATCH_AND_RETURN(*out_version = TSAV(Version, f)->get();); } +extern "C" DLLEXPORT fdb_error_t fdb_future_get_bool(FDBFuture* f, fdb_bool_t* out_value) { + CATCH_AND_RETURN(*out_value = TSAV(bool, f)->get();); +} + extern "C" DLLEXPORT fdb_error_t fdb_future_get_int64(FDBFuture* f, int64_t* out_value) { CATCH_AND_RETURN(*out_value = TSAV(int64_t, f)->get();); } @@ -493,6 +498,54 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDat FDBFuture*)(DB(db)->waitPurgeGranulesComplete(StringRef(purge_key_name, 
purge_key_name_length)).extractPtr()); } +extern "C" DLLEXPORT FDBFuture* fdb_database_blobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length) { + return (FDBFuture*)(DB(db) + ->blobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length))) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_database_unblobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length) { + return (FDBFuture*)(DB(db) + ->unblobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length))) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_database_list_blobbified_ranges(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit) { + return (FDBFuture*)(DB(db) + ->listBlobbifiedRanges(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length)), + rangeLimit) + .extractPtr()); +} + +extern "C" DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t version) { + return (FDBFuture*)(DB(db) + ->verifyBlobRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length)), + version) + .extractPtr()); +} + extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) { CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr();); } @@ -855,11 +908,12 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_blob_granule_ranges(FDBTrans 
uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length) { + int end_key_name_length, + int rangeLimit) { RETURN_FUTURE_ON_ERROR( Standalone>, KeyRangeRef range(KeyRef(begin_key_name, begin_key_name_length), KeyRef(end_key_name, end_key_name_length)); - return (FDBFuture*)(TXN(tr)->getBlobGranuleRanges(range).extractPtr());); + return (FDBFuture*)(TXN(tr)->getBlobGranuleRanges(range, rangeLimit).extractPtr());); } extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* tr, @@ -889,6 +943,57 @@ extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransactio return (FDBResult*)(TXN(tr)->readBlobGranules(range, beginVersion, rv, context).extractPtr());); } +extern "C" DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_start(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut) { + Optional rv; + if (readVersion != latestVersion) { + rv = readVersion; + } + return (FDBFuture*)(TXN(tr) + ->readBlobGranulesStart(KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length), + KeyRef(end_key_name, end_key_name_length)), + beginVersion, + rv, + readVersionOut) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules_finish(FDBTransaction* tr, + FDBFuture* f, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granule_context) { + // FIXME: better way to convert? 
+ ReadBlobGranuleContext context; + context.userContext = granule_context->userContext; + context.start_load_f = granule_context->start_load_f; + context.get_load_f = granule_context->get_load_f; + context.free_load_f = granule_context->free_load_f; + context.debugNoMaterialize = granule_context->debugNoMaterialize; + context.granuleParallelism = granule_context->granuleParallelism; + ThreadFuture>> startFuture( + TSAV(Standalone>, f)); + + return (FDBResult*)(TXN(tr) + ->readBlobGranulesFinish(startFuture, + KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length), + KeyRef(end_key_name, end_key_name_length)), + beginVersion, + readVersion, + context) + .extractPtr()); +} + #include "fdb_c_function_pointers.g.h" #define FDB_API_CHANGED(func, ver) \ @@ -964,6 +1069,10 @@ extern "C" DLLEXPORT const char* fdb_get_client_version() { return API->getClientVersion(); } +extern "C" DLLEXPORT void fdb_use_future_protocol_version() { + API->useFutureProtocolVersion(); +} + #if defined(__APPLE__) #include __attribute__((constructor)) static void initialize() { diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 409fd8ef55..10534a94dc 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -227,6 +227,8 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_set_callback(FDBFuture* f, DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_error(FDBFuture* f); #endif +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_bool(FDBFuture* f, fdb_bool_t* out); + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_int64(FDBFuture* f, int64_t* out); DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_uint64(FDBFuture* f, uint64_t* out); @@ -321,6 +323,32 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_wait_purge_granules_complet uint8_t const* purge_key_name, int purge_key_name_length); +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_blobbify_range(FDBDatabase* db, + uint8_t const* 
begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_unblobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_list_blobbified_ranges(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t version); + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction); @@ -479,7 +507,8 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges( uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length); + int end_key_name_length, + int rangeLimit); /* LatestVersion (-2) for readVersion means get read version from transaction Separated out as optional because BG reads can support longer-lived reads than normal FDB transactions */ diff --git a/bindings/c/foundationdb/fdb_c_internal.h b/bindings/c/foundationdb/fdb_c_internal.h index 2b1a2163c7..62b77f354e 100644 --- a/bindings/c/foundationdb/fdb_c_internal.h +++ b/bindings/c/foundationdb/fdb_c_internal.h @@ -49,6 +49,29 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_shared_state(FDBFuture* DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_create_database_from_connection_string(const char* connection_string, FDBDatabase** out_database); +DLLEXPORT void fdb_use_future_protocol_version(); + +// the logical read_blob_granules is broken out (at different points depending on the client type) into the asynchronous 
+// start() that happens on the fdb network thread, and synchronous finish() that happens off it +DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_start(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut); + +DLLEXPORT FDBResult* fdb_transaction_read_blob_granules_finish(FDBTransaction* tr, + FDBFuture* f, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granuleContext); + #ifdef __cplusplus } #endif diff --git a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp index 52d8ddc651..e1af440f09 100644 --- a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp @@ -18,61 +18,13 @@ * limitations under the License. 
*/ #include "TesterApiWorkload.h" +#include "TesterBlobGranuleUtil.h" #include "TesterUtil.h" #include #include namespace FdbApiTester { -class TesterGranuleContext { -public: - std::unordered_map loadsInProgress; - int64_t nextId = 0; - std::string basePath; - - ~TesterGranuleContext() { - // if there was an error or not all loads finished, delete data - for (auto& it : loadsInProgress) { - uint8_t* dataToFree = it.second; - delete[] dataToFree; - } - } -}; - -static int64_t granule_start_load(const char* filename, - int filenameLength, - int64_t offset, - int64_t length, - int64_t fullFileLength, - void* context) { - - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - int64_t loadId = ctx->nextId++; - - uint8_t* buffer = new uint8_t[length]; - std::ifstream fin(ctx->basePath + std::string(filename, filenameLength), std::ios::in | std::ios::binary); - fin.seekg(offset); - fin.read((char*)buffer, length); - - ctx->loadsInProgress.insert({ loadId, buffer }); - - return loadId; -} - -static uint8_t* granule_get_load(int64_t loadId, void* context) { - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - return ctx->loadsInProgress.at(loadId); -} - -static void granule_free_load(int64_t loadId, void* context) { - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - auto it = ctx->loadsInProgress.find(loadId); - uint8_t* dataToFree = it->second; - delete[] dataToFree; - - ctx->loadsInProgress.erase(it); -} - class ApiBlobGranuleCorrectnessWorkload : public ApiWorkload { public: ApiBlobGranuleCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) { @@ -80,9 +32,12 @@ public: if (Random::get().randomInt(0, 1) == 0) { excludedOpTypes.push_back(OP_CLEAR_RANGE); } + // FIXME: remove! this bug is fixed in another PR + excludedOpTypes.push_back(OP_GET_RANGES); } private: + // FIXME: use other new blob granule apis! 
enum OpType { OP_INSERT, OP_CLEAR, OP_CLEAR_RANGE, OP_READ, OP_GET_RANGES, OP_LAST = OP_GET_RANGES }; std::vector excludedOpTypes; @@ -101,16 +56,8 @@ private: execTransaction( [this, begin, end, results, tooOld](auto ctx) { ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); - TesterGranuleContext testerContext; - testerContext.basePath = ctx->getBGBasePath(); - - fdb::native::FDBReadBlobGranuleContext granuleContext; - granuleContext.userContext = &testerContext; - granuleContext.debugNoMaterialize = false; - granuleContext.granuleParallelism = 1; - granuleContext.start_load_f = &granule_start_load; - granuleContext.get_load_f = &granule_get_load; - granuleContext.free_load_f = &granule_free_load; + TesterGranuleContext testerContext(ctx->getBGBasePath()); + fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext); fdb::Result res = ctx->tx().readBlobGranules( begin, end, 0 /* beginVersion */, -2 /* latest read version */, granuleContext); @@ -124,8 +71,10 @@ private: } else if (err.code() != error_code_success) { ctx->onError(err); } else { - auto& [out_kv, out_count, out_more] = out; + auto resCopy = copyKeyValueArray(out); + auto& [resVector, out_more] = resCopy; ASSERT(!out_more); + results.get()->assign(resVector.begin(), resVector.end()); if (!seenReadSuccess) { info("BlobGranuleCorrectness::randomReadOp first success\n"); } @@ -178,7 +127,7 @@ private: } execTransaction( [begin, end, results](auto ctx) { - fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end).eraseType(); + fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType(); ctx->continueAfter( f, [ctx, f, results]() { @@ -196,11 +145,25 @@ private: for (int i = 0; i < results->size(); i++) { // no empty or inverted ranges + if ((*results)[i].beginKey >= (*results)[i].endKey) { + error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})", + fdb::toCharsRef((*results)[i].beginKey), + 
fdb::toCharsRef((*results)[i].endKey), + fdb::toCharsRef(begin), + fdb::toCharsRef(end))); + } ASSERT((*results)[i].beginKey < (*results)[i].endKey); } for (int i = 1; i < results->size(); i++) { // ranges contain entire requested key range + if ((*results)[i].beginKey != (*results)[i].endKey) { + error(fmt::format("Non-contiguous range [{0} - {1}) for getBlobGranuleRanges({2} - {3})", + fdb::toCharsRef((*results)[i].beginKey), + fdb::toCharsRef((*results)[i].endKey), + fdb::toCharsRef(begin), + fdb::toCharsRef(end))); + } ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey); } diff --git a/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp new file mode 100644 index 0000000000..7bb879a185 --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp @@ -0,0 +1,145 @@ +/* + * TesterBlobGranuleErrorsWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "TesterApiWorkload.h" +#include "TesterBlobGranuleUtil.h" +#include "TesterUtil.h" +#include +#include + +namespace FdbApiTester { + +class BlobGranuleErrorsWorkload : public ApiWorkload { +public: + BlobGranuleErrorsWorkload(const WorkloadConfig& config) : ApiWorkload(config) {} + +private: + enum OpType { + OP_READ_NO_MATERIALIZE, + OP_READ_FILE_LOAD_ERROR, + OP_READ_TOO_OLD, + OP_CANCEL_RANGES, + OP_LAST = OP_CANCEL_RANGES + }; + + // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet + // FIXME: should still guarantee a read succeeds eventually somehow + bool seenReadSuccess = false; + + void doErrorOp(TTaskFct cont, + std::string basePathAddition, + bool doMaterialize, + int64_t readVersion, + fdb::native::fdb_error_t expectedError) { + fdb::Key begin = randomKeyName(); + fdb::Key end = begin; + // [K - K) empty range will succeed read because there is trivially nothing to do, so don't do it + while (end == begin) { + end = randomKeyName(); + } + if (begin > end) { + std::swap(begin, end); + } + + execTransaction( + [this, begin, end, basePathAddition, doMaterialize, readVersion, expectedError](auto ctx) { + ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); + + TesterGranuleContext testerContext(ctx->getBGBasePath() + basePathAddition); + fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext); + granuleContext.debugNoMaterialize = !doMaterialize; + + fdb::Result res = + ctx->tx().readBlobGranules(begin, end, 0 /* beginVersion */, readVersion, granuleContext); + auto out = fdb::Result::KeyValueRefArray{}; + fdb::Error err = res.getKeyValueArrayNothrow(out); + + if (err.code() == error_code_success) { + error(fmt::format("Operation succeeded in error test!")); + } + ASSERT(err.code() != error_code_success); + if (err.code() != error_code_blob_granule_transaction_too_old) { + seenReadSuccess = true; + } + if (err.code() != expectedError) { + 
info(fmt::format("incorrect error. Expected {}, Got {}", err.code(), expectedError)); + if (err.code() == error_code_blob_granule_transaction_too_old) { + ASSERT(!seenReadSuccess); + ctx->done(); + } else { + ctx->onError(err); + } + } else { + ctx->done(); + } + }, + [this, cont]() { schedule(cont); }); + } + + void randomOpReadNoMaterialize(TTaskFct cont) { + // ensure setting noMaterialize flag produces blob_granule_not_materialized + doErrorOp(cont, "", false, -2 /*latest read version */, error_code_blob_granule_not_materialized); + } + + void randomOpReadFileLoadError(TTaskFct cont) { + // point to a file path that doesn't exist by adding an extra suffix + doErrorOp(cont, "extrapath/", true, -2 /*latest read version */, error_code_blob_granule_file_load_error); + } + + void randomOpReadTooOld(TTaskFct cont) { + // read at a version (1) that should predate granule data + doErrorOp(cont, "", true, 1, error_code_blob_granule_transaction_too_old); + } + + void randomCancelGetRangesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execTransaction( + [begin, end](auto ctx) { + fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomOperation(TTaskFct cont) override { + OpType txType = (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { + case OP_READ_NO_MATERIALIZE: + randomOpReadNoMaterialize(cont); + break; + case OP_READ_FILE_LOAD_ERROR: + randomOpReadFileLoadError(cont); + break; + case OP_READ_TOO_OLD: + randomOpReadTooOld(cont); + break; + case OP_CANCEL_RANGES: + randomCancelGetRangesOp(cont); + break; + } + } +}; + +WorkloadFactory BlobGranuleErrorsWorkloadFactory("BlobGranuleErrors"); + +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp b/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp new file mode 100644 
index 0000000000..a908a9c0bf --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp @@ -0,0 +1,80 @@ +/* + * TesterBlobGranuleUtil.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterBlobGranuleUtil.h" +#include "TesterUtil.h" +#include + +namespace FdbApiTester { + +// FIXME: avoid duplicating this between files! 
+static int64_t granule_start_load(const char* filename, + int filenameLength, + int64_t offset, + int64_t length, + int64_t fullFileLength, + void* context) { + + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + int64_t loadId = ctx->nextId++; + + uint8_t* buffer = new uint8_t[length]; + std::ifstream fin(ctx->basePath + std::string(filename, filenameLength), std::ios::in | std::ios::binary); + if (fin.fail()) { + delete[] buffer; + buffer = nullptr; + } else { + fin.seekg(offset); + fin.read((char*)buffer, length); + } + + ctx->loadsInProgress.insert({ loadId, buffer }); + + return loadId; +} + +static uint8_t* granule_get_load(int64_t loadId, void* context) { + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + return ctx->loadsInProgress.at(loadId); +} + +static void granule_free_load(int64_t loadId, void* context) { + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + auto it = ctx->loadsInProgress.find(loadId); + uint8_t* dataToFree = it->second; + delete[] dataToFree; + + ctx->loadsInProgress.erase(it); +} + +fdb::native::FDBReadBlobGranuleContext createGranuleContext(const TesterGranuleContext* testerContext) { + fdb::native::FDBReadBlobGranuleContext granuleContext; + + granuleContext.userContext = (void*)testerContext; + granuleContext.debugNoMaterialize = false; + granuleContext.granuleParallelism = 1 + Random::get().randomInt(0, 3); + granuleContext.start_load_f = &granule_start_load; + granuleContext.get_load_f = &granule_get_load; + granuleContext.free_load_f = &granule_free_load; + + return granuleContext; +} + +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterBlobGranuleUtil.h b/bindings/c/test/apitester/TesterBlobGranuleUtil.h new file mode 100644 index 0000000000..7b4b0dba81 --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleUtil.h @@ -0,0 +1,49 @@ +/* + * TesterBlobGranuleUtil.h + * + * This source file is part of the FoundationDB open source 
project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef APITESTER_BLOBGRANULE_UTIL_H +#define APITESTER_BLOBGRANULE_UTIL_H +#include "TesterUtil.h" +#include "test/fdb_api.hpp" +#include + +namespace FdbApiTester { + +class TesterGranuleContext { +public: + std::unordered_map loadsInProgress; + std::string basePath; + int64_t nextId; + + TesterGranuleContext(const std::string& basePath) : basePath(basePath), nextId(0) {} + + ~TesterGranuleContext() { + // this should now never happen with proper memory management + ASSERT(loadsInProgress.empty()); + } +}; + +fdb::native::FDBReadBlobGranuleContext createGranuleContext(const TesterGranuleContext* testerContext); + +} // namespace FdbApiTester + +#endif diff --git a/bindings/c/test/apitester/TesterExampleWorkload.cpp b/bindings/c/test/apitester/TesterExampleWorkload.cpp new file mode 100644 index 0000000000..3765dc50fb --- /dev/null +++ b/bindings/c/test/apitester/TesterExampleWorkload.cpp @@ -0,0 +1,65 @@ +/* + * TesterExampleWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterWorkload.h" +#include "TesterUtil.h" + +namespace FdbApiTester { + +class SetAndGetWorkload : public WorkloadBase { +public: + fdb::Key keyPrefix; + Random random; + + SetAndGetWorkload(const WorkloadConfig& config) : WorkloadBase(config) { + keyPrefix = fdb::toBytesRef(fmt::format("{}/", workloadId)); + } + + void start() override { setAndGet(NO_OP_TASK); } + + void setAndGet(TTaskFct cont) { + fdb::Key key = keyPrefix + random.randomStringLowerCase(10, 100); + fdb::Value value = random.randomStringLowerCase(10, 1000); + execTransaction( + [key, value](auto ctx) { + ctx->tx().set(key, value); + ctx->commit(); + }, + [this, key, value, cont]() { + execTransaction( + [this, key, value](auto ctx) { + auto future = ctx->tx().get(key, false); + ctx->continueAfter(future, [this, ctx, future, value]() { + std::optional res = copyValueRef(future.get()); + if (res != value) { + error(fmt::format( + "expected: {} actual: {}", fdb::toCharsRef(value), fdb::toCharsRef(res.value()))); + } + ctx->done(); + }); + }, + cont); + }); + } +}; + +WorkloadFactory SetAndGetWorkloadFactory("SetAndGet"); + +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterOptions.h b/bindings/c/test/apitester/TesterOptions.h index 3ff57ec183..1160b696b0 100644 --- a/bindings/c/test/apitester/TesterOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -38,6 +38,7 @@ public: std::string logGroup; std::string externalClientLibrary; std::string externalClientDir; + std::string futureVersionClientLibrary; std::string tmpDir; bool 
disableLocalClient = false; std::string testFile; diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index cbce118f10..6cdfacc423 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -165,8 +165,11 @@ void WorkloadManager::add(std::shared_ptr workload, TTaskFct cont) { void WorkloadManager::run() { std::vector> initialWorkloads; - for (auto iter : workloads) { - initialWorkloads.push_back(iter.second.ref); + { + std::unique_lock lock(mutex); + for (auto iter : workloads) { + initialWorkloads.push_back(iter.second.ref); + } } for (auto iter : initialWorkloads) { iter->init(this); @@ -324,4 +327,4 @@ std::unordered_map& IWorkloadFactory::factories( return theFactories; } -} // namespace FdbApiTester \ No newline at end of file +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml new file mode 100644 index 0000000000..788bd04d85 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml @@ -0,0 +1,22 @@ +[[test]] +title = 'Blob Granule Errors Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml new file mode 100644 index 0000000000..788bd04d85 --- /dev/null +++ 
b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml @@ -0,0 +1,22 @@ +[[test]] +title = 'Blob Granule Errors Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml new file mode 100644 index 0000000000..85e78975f6 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml @@ -0,0 +1,15 @@ +[[test]] +title = 'Blob Granule Errors Single Threaded' +minClients = 1 +maxClients = 3 +multiThreaded = false + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 62b6af6dd4..310ebd9b83 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -46,6 +46,7 @@ enum TesterOptionId { OPT_KNOB, OPT_EXTERNAL_CLIENT_LIBRARY, OPT_EXTERNAL_CLIENT_DIRECTORY, + OPT_FUTURE_VERSION_CLIENT_LIBRARY, OPT_TMP_DIR, OPT_DISABLE_LOCAL_CLIENT, OPT_TEST_FILE, @@ -72,6 +73,7 @@ CSimpleOpt::SOption TesterOptionDefs[] = // { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP }, { OPT_EXTERNAL_CLIENT_DIRECTORY, "--external-client-dir", SO_REQ_SEP }, + { 
OPT_FUTURE_VERSION_CLIENT_LIBRARY, "--future-version-client-library", SO_REQ_SEP }, { OPT_TMP_DIR, "--tmp-dir", SO_REQ_SEP }, { OPT_DISABLE_LOCAL_CLIENT, "--disable-local-client", SO_NONE }, { OPT_TEST_FILE, "-f", SO_REQ_SEP }, @@ -110,6 +112,8 @@ void printProgramUsage(const char* execName) { " Path to the external client library.\n" " --external-client-dir DIR\n" " Directory containing external client libraries.\n" + " --future-version-client-library FILE\n" + " Path to a client library to be used with a future protocol version.\n" " --tmp-dir DIR\n" " Directory for temporary files of the client.\n" " --disable-local-client DIR\n" @@ -204,6 +208,9 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { case OPT_EXTERNAL_CLIENT_DIRECTORY: options.externalClientDir = args.OptionArg(); break; + case OPT_FUTURE_VERSION_CLIENT_LIBRARY: + options.futureVersionClientLibrary = args.OptionArg(); + break; case OPT_TMP_DIR: options.tmpDir = args.OptionArg(); break; @@ -296,6 +303,11 @@ void applyNetworkOptions(TesterOptions& options) { } } + if (!options.futureVersionClientLibrary.empty()) { + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_FUTURE_VERSION_CLIENT_LIBRARY, + options.futureVersionClientLibrary); + } + if (options.testSpec.multiThreaded) { fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads); } diff --git a/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml b/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml new file mode 100644 index 0000000000..84531ea9c8 --- /dev/null +++ b/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml @@ -0,0 +1,23 @@ +[[test]] +title = 'Mixed Workload for Upgrade Tests with a Multi-Threaded Client' +multiThreaded = true +buggify = true +databasePerTransaction = false +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 
2 +maxClients = 8 + + [[test.workload]] + name = 'ApiBlobGranuleCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml index 86e65c5918..94bf4e0509 100644 --- a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml +++ b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml @@ -32,4 +32,14 @@ maxClients = 8 maxKeysPerTransaction = 50 initialSize = 100 runUntilStop = true - readExistingKeysRatio = 0.9 \ No newline at end of file + readExistingKeysRatio = 0.9 + + [[test.workload]] + name = 'AtomicOpsCorrectness' + initialSize = 0 + runUntilStop = true + + [[test.workload]] + name = 'WatchAndWait' + initialSize = 0 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml index 42df76521b..daf070b31b 100644 --- a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml +++ b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml @@ -30,4 +30,14 @@ maxClients = 8 maxKeysPerTransaction = 50 initialSize = 100 runUntilStop = true - readExistingKeysRatio = 0.9 \ No newline at end of file + readExistingKeysRatio = 0.9 + + [[test.workload]] + name = 'AtomicOpsCorrectness' + initialSize = 0 + runUntilStop = true + + [[test.workload]] + name = 'WatchAndWait' + initialSize = 0 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/fdb_api.hpp b/bindings/c/test/fdb_api.hpp index bee40981c3..6d0db008a2 100644 --- a/bindings/c/test/fdb_api.hpp +++ b/bindings/c/test/fdb_api.hpp @@ -559,9 +559,9 @@ public: reverse); } - TypedFuture getBlobGranuleRanges(KeyRef 
begin, KeyRef end) { + TypedFuture getBlobGranuleRanges(KeyRef begin, KeyRef end, int rangeLimit) { return native::fdb_transaction_get_blob_granule_ranges( - tr.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit); } Result readBlobGranules(KeyRef begin, diff --git a/bindings/c/test/mako/blob_granules.cpp b/bindings/c/test/mako/blob_granules.cpp index af805f2e56..1071737211 100644 --- a/bindings/c/test/mako/blob_granules.cpp +++ b/bindings/c/test/mako/blob_granules.cpp @@ -26,6 +26,9 @@ extern thread_local mako::Logger logr; +// FIXME: use the same implementation as the api tester! this implementation was from back when mako was written in C +// and is inferior. + namespace mako::blob_granules::local_file { int64_t startLoad(const char* filename, diff --git a/bindings/c/test/unit/fdb_api.cpp b/bindings/c/test/unit/fdb_api.cpp index d3c1dec30d..d454082af3 100644 --- a/bindings/c/test/unit/fdb_api.cpp +++ b/bindings/c/test/unit/fdb_api.cpp @@ -356,9 +356,15 @@ fdb_error_t Transaction::add_conflict_range(std::string_view begin_key, tr_, (const uint8_t*)begin_key.data(), begin_key.size(), (const uint8_t*)end_key.data(), end_key.size(), type); } -KeyRangeArrayFuture Transaction::get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key) { - return KeyRangeArrayFuture(fdb_transaction_get_blob_granule_ranges( - tr_, (const uint8_t*)begin_key.data(), begin_key.size(), (const uint8_t*)end_key.data(), end_key.size())); +KeyRangeArrayFuture Transaction::get_blob_granule_ranges(std::string_view begin_key, + std::string_view end_key, + int rangeLimit) { + return KeyRangeArrayFuture(fdb_transaction_get_blob_granule_ranges(tr_, + (const uint8_t*)begin_key.data(), + begin_key.size(), + (const uint8_t*)end_key.data(), + end_key.size(), + rangeLimit)); } KeyValueArrayResult Transaction::read_blob_granules(std::string_view begin_key, std::string_view end_key, diff --git 
a/bindings/c/test/unit/fdb_api.hpp b/bindings/c/test/unit/fdb_api.hpp index 7d44a30a9a..d0c4abd8db 100644 --- a/bindings/c/test/unit/fdb_api.hpp +++ b/bindings/c/test/unit/fdb_api.hpp @@ -348,7 +348,7 @@ public: // Wrapper around fdb_transaction_add_conflict_range. fdb_error_t add_conflict_range(std::string_view begin_key, std::string_view end_key, FDBConflictRangeType type); - KeyRangeArrayFuture get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key); + KeyRangeArrayFuture get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key, int rangeLimit); KeyValueArrayResult read_blob_granules(std::string_view begin_key, std::string_view end_key, int64_t beginVersion, diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 9f5c015bfb..2ab80cf90c 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -2853,7 +2853,7 @@ TEST_CASE("Blob Granule Functions") { // test ranges while (1) { - fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh")); + fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh"), 1000); fdb_error_t err = wait_future(f); if (err) { fdb::EmptyFuture f2 = tr.on_error(err); diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index a3c0674e64..a58cea3f1f 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -239,6 +239,13 @@ func (o NetworkOptions) SetClientThreadsPerVersion(param int64) error { return o.setOpt(65, int64ToBytes(param)) } +// Adds an external client library to be used with a future version protocol. This option can be used testing purposes only! +// +// Parameter: path to client library +func (o NetworkOptions) SetFutureVersionClientLibrary(param string) error { + return o.setOpt(66, []byte(param)) +} + // Disables logging of client statistics, such as sampled transaction activity. 
func (o NetworkOptions) SetDisableClientStatisticsLogging() error { return o.setOpt(70, nil) @@ -615,6 +622,13 @@ func (o TransactionOptions) SetUseGrvCache() error { return o.setOpt(1101, nil) } +// Attach given authorization token to the transaction such that subsequent tenant-aware requests are authorized +// +// Parameter: A JSON Web Token authorized to access data belonging to one or more tenants, indicated by 'tenants' claim of the token's payload. +func (o TransactionOptions) SetAuthorizationToken(param string) error { + return o.setOpt(2000, []byte(param)) +} + type StreamingMode int const ( diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 22564dccc8..7057f22384 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -34,9 +34,11 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/FDBDatabase.java src/main/com/apple/foundationdb/FDBTenant.java src/main/com/apple/foundationdb/FDBTransaction.java + src/main/com/apple/foundationdb/FutureBool.java src/main/com/apple/foundationdb/FutureInt64.java src/main/com/apple/foundationdb/FutureKey.java src/main/com/apple/foundationdb/FutureKeyArray.java + src/main/com/apple/foundationdb/FutureKeyRangeArray.java src/main/com/apple/foundationdb/FutureResult.java src/main/com/apple/foundationdb/FutureResults.java src/main/com/apple/foundationdb/FutureMappedResults.java @@ -56,6 +58,7 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/RangeQuery.java src/main/com/apple/foundationdb/MappedRangeQuery.java src/main/com/apple/foundationdb/KeyArrayResult.java + src/main/com/apple/foundationdb/KeyRangeArrayResult.java src/main/com/apple/foundationdb/RangeResult.java src/main/com/apple/foundationdb/MappedRangeResult.java src/main/com/apple/foundationdb/RangeResultInfo.java diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index e685d3ee53..c2b5ea90cc 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -25,9 +25,11 @@ #include 
"com_apple_foundationdb_FDB.h" #include "com_apple_foundationdb_FDBDatabase.h" #include "com_apple_foundationdb_FDBTransaction.h" +#include "com_apple_foundationdb_FutureBool.h" #include "com_apple_foundationdb_FutureInt64.h" #include "com_apple_foundationdb_FutureKey.h" #include "com_apple_foundationdb_FutureKeyArray.h" +#include "com_apple_foundationdb_FutureKeyRangeArray.h" #include "com_apple_foundationdb_FutureResult.h" #include "com_apple_foundationdb_FutureResults.h" #include "com_apple_foundationdb_FutureStrings.h" @@ -55,7 +57,11 @@ static jclass mapped_range_result_class; static jclass mapped_key_value_class; static jclass string_class; static jclass key_array_result_class; +static jclass keyrange_class; +static jclass keyrange_array_result_class; static jmethodID key_array_result_init; +static jmethodID keyrange_init; +static jmethodID keyrange_array_result_init; static jmethodID range_result_init; static jmethodID mapped_range_result_init; static jmethodID mapped_key_value_from_bytes; @@ -278,6 +284,23 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_NativeFuture_Future_1releaseM fdb_future_release_memory(var); } +JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FutureBool_FutureBool_1get(JNIEnv* jenv, jobject, jlong future) { + if (!future) { + throwParamNotNull(jenv); + return 0; + } + FDBFuture* f = (FDBFuture*)future; + + fdb_bool_t value = false; + fdb_error_t err = fdb_future_get_bool(f, &value); + if (err) { + safeThrow(jenv, getThrowable(jenv, err)); + return 0; + } + + return (jboolean)value; +} + JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FutureInt64_FutureInt64_1get(JNIEnv* jenv, jobject, jlong future) { if (!future) { throwParamNotNull(jenv); @@ -407,6 +430,61 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureKeyArray_FutureKeyAr return result; } +JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureKeyRangeArray_FutureKeyRangeArray_1get(JNIEnv* jenv, + jobject, + jlong future) { + if (!future) { + 
throwParamNotNull(jenv); + return JNI_NULL; + } + + FDBFuture* f = (FDBFuture*)future; + + const FDBKeyRange* fdbKr; + int count; + fdb_error_t err = fdb_future_get_keyrange_array(f, &fdbKr, &count); + if (err) { + safeThrow(jenv, getThrowable(jenv, err)); + return JNI_NULL; + } + + jobjectArray kr_values = jenv->NewObjectArray(count, keyrange_class, NULL); + if (!kr_values) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + + for (int i = 0; i < count; i++) { + jbyteArray beginArr = jenv->NewByteArray(fdbKr[i].begin_key_length); + if (!beginArr) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + jbyteArray endArr = jenv->NewByteArray(fdbKr[i].end_key_length); + if (!endArr) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + jenv->SetByteArrayRegion(beginArr, 0, fdbKr[i].begin_key_length, (const jbyte*)fdbKr[i].begin_key); + jenv->SetByteArrayRegion(endArr, 0, fdbKr[i].end_key_length, (const jbyte*)fdbKr[i].end_key); + + jobject kr = jenv->NewObject(keyrange_class, keyrange_init, beginArr, endArr); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + jenv->SetObjectArrayElement(kr_values, i, kr); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + } + jobject krarr = jenv->NewObject(keyrange_array_result_class, keyrange_array_result_init, kr_values); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + + return krarr; +} + // SOMEDAY: explore doing this more efficiently with Direct ByteBuffers JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResults_1get(JNIEnv* jenv, jobject, @@ -830,6 +908,142 @@ Java_com_apple_foundationdb_FDBDatabase_Database_1waitPurgeGranulesComplete(JNIE return (jlong)f; } +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1blobbifyRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + 
throwParamNotNull(jenv); + return 0; + } + + FDBDatabase* database = (FDBDatabase*)dbPtr; + + uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!beginKeyArr) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKeyArr) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_blobbify_range( + database, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes)); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1unblobbifyRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + + FDBDatabase* database = (FDBDatabase*)dbPtr; + + uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!beginKeyArr) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKeyArr) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_unblobbify_range( + database, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes)); + 
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1listBlobbifiedRanges(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jint rangeLimit) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* tr = (FDBDatabase*)dbPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_list_blobbified_ranges( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rangeLimit); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1verifyBlobRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jlong version) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* tr = (FDBDatabase*)dbPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = 
(uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_list_blobbified_ranges( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), version); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv, jobject, jint predicate, @@ -1307,6 +1521,41 @@ Java_com_apple_foundationdb_FDBTransaction_Transaction_1getRangeSplitPoints(JNIE return (jlong)f; } +JNIEXPORT jlong JNICALL +Java_com_apple_foundationdb_FDBTransaction_Transaction_1getBlobGranuleRanges(JNIEnv* jenv, + jobject, + jlong tPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jint rowLimit) { + if (!tPtr || !beginKeyBytes || !endKeyBytes || !rowLimit) { + throwParamNotNull(jenv); + return 0; + } + FDBTransaction* tr = (FDBTransaction*)tPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_transaction_get_blob_granule_ranges( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rowLimit); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + 
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1set(JNIEnv* jenv, jobject, jlong tPtr, @@ -1746,6 +1995,15 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { key_array_result_init = env->GetMethodID(local_key_array_result_class, "", "([B[I)V"); key_array_result_class = (jclass)(env)->NewGlobalRef(local_key_array_result_class); + jclass local_keyrange_class = env->FindClass("com/apple/foundationdb/Range"); + keyrange_init = env->GetMethodID(local_keyrange_class, "", "([B[B)V"); + keyrange_class = (jclass)(env)->NewGlobalRef(local_keyrange_class); + + jclass local_keyrange_array_result_class = env->FindClass("com/apple/foundationdb/KeyRangeArrayResult"); + keyrange_array_result_init = + env->GetMethodID(local_keyrange_array_result_class, "", "([Lcom/apple/foundationdb/Range;)V"); + keyrange_array_result_class = (jclass)(env)->NewGlobalRef(local_keyrange_array_result_class); + jclass local_range_result_summary_class = env->FindClass("com/apple/foundationdb/RangeResultSummary"); range_result_summary_init = env->GetMethodID(local_range_result_summary_class, "", "([BIZ)V"); range_result_summary_class = (jclass)(env)->NewGlobalRef(local_range_result_summary_class); @@ -1770,6 +2028,12 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { if (range_result_class != JNI_NULL) { env->DeleteGlobalRef(range_result_class); } + if (keyrange_array_result_class != JNI_NULL) { + env->DeleteGlobalRef(keyrange_array_result_class); + } + if (keyrange_class != JNI_NULL) { + env->DeleteGlobalRef(keyrange_class); + } if (mapped_range_result_class != JNI_NULL) { env->DeleteGlobalRef(mapped_range_result_class); } diff --git a/bindings/java/src/main/com/apple/foundationdb/Database.java b/bindings/java/src/main/com/apple/foundationdb/Database.java index 8608effe53..a3f012ba7c 100644 --- a/bindings/java/src/main/com/apple/foundationdb/Database.java +++ 
b/bindings/java/src/main/com/apple/foundationdb/Database.java @@ -161,6 +161,20 @@ public interface Database extends AutoCloseable, TransactionContext { */ double getMainThreadBusyness(); + /** + * Runs {@link #purgeBlobGranules(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param purgeVersion version to purge at + * @param force if true delete all data, if not keep data >= purgeVersion + * + * @return the key to watch for purge complete + */ + default CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force) { + return purgeBlobGranules(beginKey, endKey, purgeVersion, force, getExecutor()); + } + /** * Queues a purge of blob granules for the specified key range, at the specified version. * @@ -168,17 +182,126 @@ public interface Database extends AutoCloseable, TransactionContext { * @param endKey end of the key range * @param purgeVersion version to purge at * @param force if true delete all data, if not keep data >= purgeVersion + * @param e the {@link Executor} to use for asynchronous callbacks + * @return the key to watch for purge complete */ CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e); + /** - * Wait for a previous call to purgeBlobGranules to complete + * Runs {@link #waitPurgeGranulesComplete(Function)} on the default executor. * * @param purgeKey key to watch */ + default CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey) { + return waitPurgeGranulesComplete(purgeKey, getExecutor()); + } + + /** + * Wait for a previous call to purgeBlobGranules to complete. + * + * @param purgeKey key to watch + * @param e the {@link Executor} to use for asynchronous callbacks + */ CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey, Executor e); + /** + * Runs {@link #blobbifyRange(Function)} on the default executor. 
+ * + * @param beginKey start of the key range + * @param endKey end of the key range + + * @return if the recording of the range was successful + */ + default CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey) { + return blobbifyRange(beginKey, endKey, getExecutor()); + } + + /** + * Sets a range to be blobbified in the database. Must be a completely unblobbified range. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return if the recording of the range was successful + */ + CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey, Executor e); + + /** + * Runs {@link #unblobbifyRange(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + + * @return if the recording of the range was successful + */ + default CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey) { + return unblobbifyRange(beginKey, endKey, getExecutor()); + } + + /** + * Unsets a blobbified range in the database. The range must be aligned to known blob ranges. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return if the recording of the range was successful + */ + CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e); + + /** + * Runs {@link #listBlobbifiedRanges(Function)} on the default executor. 
+ * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param rangeLimit batch size + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)] + */ + default CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit) { + return listBlobbifiedRanges(beginKey, endKey, rangeLimit, getExecutor()); + } + + /** + * Lists blobbified ranges in the database. There may be more if result.size() == rangeLimit. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param rangeLimit batch size + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)] + */ + CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e); + + /** + * Runs {@link #verifyBlobRange(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param version version to read at + * + * @return a future with the version of the last blob granule. + */ + default CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version) { + return verifyBlobRange(beginKey, endKey, version, getExecutor()); + } + + /** + * Checks if a blob range is blobbified. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param version version to read at + * + * @return a future with the version of the last blob granule. + */ + CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e); + /** * Runs a read-only transactional function against this {@code Database} with retry logic. 
* {@link Function#apply(Object) apply(ReadTransaction)} will be called on the diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java index 50a63cc910..98c001a1b0 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java @@ -201,20 +201,60 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume } @Override - public CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor executor) { + public CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e) { pointerReadLock.lock(); try { - return new FutureKey(Database_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force), executor, eventKeeper); + return new FutureKey(Database_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force), e, eventKeeper); } finally { pointerReadLock.unlock(); } } @Override - public CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey, Executor executor) { + public CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey, Executor e) { pointerReadLock.lock(); try { - return new FutureVoid(Database_waitPurgeGranulesComplete(getPtr(), purgeKey), executor); + return new FutureVoid(Database_waitPurgeGranulesComplete(getPtr(), purgeKey), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey, Executor e) { + pointerReadLock.lock(); + try { + return new FutureBool(Database_blobbifyRange(getPtr(), beginKey, endKey), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e) { + pointerReadLock.lock(); + try { + return new FutureBool(Database_unblobbifyRange(getPtr(), beginKey, endKey), 
e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e) { + pointerReadLock.lock(); + try { + return new FutureKeyRangeArray(Database_listBlobbifiedRanges(getPtr(), beginKey, endKey, rangeLimit), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e) { + pointerReadLock.lock(); + try { + return new FutureInt64(Database_verifyBlobRange(getPtr(), beginKey, endKey, version), e); } finally { pointerReadLock.unlock(); } @@ -237,4 +277,8 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume private native double Database_getMainThreadBusyness(long cPtr); private native long Database_purgeBlobGranules(long cPtr, byte[] beginKey, byte[] endKey, long purgeVersion, boolean force); private native long Database_waitPurgeGranulesComplete(long cPtr, byte[] purgeKey); + private native long Database_blobbifyRange(long cPtr, byte[] beginKey, byte[] endKey); + private native long Database_unblobbifyRange(long cPtr, byte[] beginKey, byte[] endKey); + private native long Database_listBlobbifiedRanges(long cPtr, byte[] beginKey, byte[] endKey, int rangeLimit); + private native long Database_verifyBlobRange(long cPtr, byte[] beginKey, byte[] endKey, long version); } \ No newline at end of file diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java index b35196c146..7943c5e9d1 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java @@ -97,6 +97,11 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC return FDBTransaction.this.getRangeSplitPoints(range, chunkSize); } + @Override + public CompletableFuture 
getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit) { + return FDBTransaction.this.getBlobGranuleRanges(begin, end, rowLimit); + } + @Override public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, int matchIndex, boolean reverse, @@ -352,6 +357,16 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC return this.getRangeSplitPoints(range.begin, range.end, chunkSize); } + @Override + public CompletableFuture getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit) { + pointerReadLock.lock(); + try { + return new FutureKeyRangeArray(Transaction_getBlobGranuleRanges(getPtr(), begin, end, rowLimit), executor); + } finally { + pointerReadLock.unlock(); + } + } + @Override public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, int matchIndex, boolean reverse, StreamingMode mode) { @@ -842,4 +857,5 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC private native long Transaction_getKeyLocations(long cPtr, byte[] key); private native long Transaction_getEstimatedRangeSizeBytes(long cPtr, byte[] keyBegin, byte[] keyEnd); private native long Transaction_getRangeSplitPoints(long cPtr, byte[] keyBegin, byte[] keyEnd, long chunkSize); + private native long Transaction_getBlobGranuleRanges(long cPtr, byte[] keyBegin, byte[] keyEnd, int rowLimit); } diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureBool.java b/bindings/java/src/main/com/apple/foundationdb/FutureBool.java new file mode 100644 index 0000000000..ddbbd02649 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/FutureBool.java @@ -0,0 +1,37 @@ +/* + * FutureBool.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb; + +import java.util.concurrent.Executor; + +class FutureBool extends NativeFuture { + FutureBool(long cPtr, Executor executor) { + super(cPtr); + registerMarshalCallback(executor); + } + + @Override + protected Boolean getIfDone_internal(long cPtr) throws FDBException { + return FutureBool_get(cPtr); + } + + private native boolean FutureBool_get(long cPtr) throws FDBException; +} diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java b/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java new file mode 100644 index 0000000000..d866e9fca4 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java @@ -0,0 +1,37 @@ +/* + * FutureKeyRangeArray.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb; + +import java.util.concurrent.Executor; + +class FutureKeyRangeArray extends NativeFuture { + FutureKeyRangeArray(long cPtr, Executor executor) { + super(cPtr); + registerMarshalCallback(executor); + } + + @Override + protected KeyRangeArrayResult getIfDone_internal(long cPtr) throws FDBException { + return FutureKeyRangeArray_get(cPtr); + } + + private native KeyRangeArrayResult FutureKeyRangeArray_get(long cPtr) throws FDBException; +} diff --git a/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java b/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java new file mode 100644 index 0000000000..7385b8fe0a --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java @@ -0,0 +1,36 @@ +/* + * KeyRangeArrayResult.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.apple.foundationdb; + +import java.util.Arrays; +import java.util.List; + +public class KeyRangeArrayResult { + final List<Range> keyRanges; + + public KeyRangeArrayResult(Range[] keyRangeArr) { + this.keyRanges = Arrays.asList(keyRangeArr); + } + + public List<Range> getKeyRanges() { + return keyRanges; + } +} diff --git a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java index 11ed7e900c..04050de6fb 100644 --- a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java @@ -513,6 +513,18 @@ public interface ReadTransaction extends ReadTransactionContext { */ CompletableFuture getRangeSplitPoints(Range range, long chunkSize); + /** + * Gets the blob granule ranges for a given region. + * Returned in batches, requires calling again moving the begin key up. + * + * @param begin beginning of the range (inclusive) + * @param end end of the range (exclusive) + * @param rowLimit batch size + * + * @return list of blob granules in the given range. May not be all. 
+ */ + CompletableFuture getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit); + /** * Returns a set of options that can be set on a {@code Transaction} diff --git a/bindings/java/src/test/com/apple/foundationdb/test/Context.java b/bindings/java/src/test/com/apple/foundationdb/test/Context.java index a594e088a1..151a4ba599 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/Context.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/Context.java @@ -29,6 +29,7 @@ import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; import com.apple.foundationdb.Database; import com.apple.foundationdb.FDB; @@ -64,7 +65,7 @@ abstract class Context implements Runnable, AutoCloseable { private List children = new LinkedList<>(); private static Map transactionMap = new HashMap<>(); private static Map transactionRefCounts = new HashMap<>(); - private static Map tenantMap = new HashMap<>(); + private static Map tenantMap = new ConcurrentHashMap<>(); Context(Database db, byte[] prefix) { this.db = db; diff --git a/bindings/python/tests/size_limit_tests.py b/bindings/python/tests/size_limit_tests.py index cd27f985b0..b94d7ea8e4 100644 --- a/bindings/python/tests/size_limit_tests.py +++ b/bindings/python/tests/size_limit_tests.py @@ -66,6 +66,9 @@ def test_size_limit_option(db): except fdb.FDBError as e: assert(e.code == 2101) # Transaction exceeds byte limit (2101) + # Reset the size limit for future tests + db.options.set_transaction_size_limit(10000000) + @fdb.transactional def test_get_approximate_size(tr): tr[b'key1'] = b'value1' diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index 066baf7100..786126359b 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -142,7 +142,7 @@ function(add_fdb_test) ${VALGRIND_OPTION} ${ADD_FDB_TEST_TEST_FILES} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - 
set_tests_properties("${test_name}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${test_name}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") get_filename_component(test_dir_full ${first_file} DIRECTORY) if(NOT ${test_dir_full} STREQUAL "") get_filename_component(test_dir ${test_dir_full} NAME) @@ -172,8 +172,7 @@ function(stage_correctness_package) file(MAKE_DIRECTORY ${STAGE_OUT_DIR}/bin) string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length) foreach(test IN LISTS TEST_NAMES) - if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND - (${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND + if((${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND (NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE})) foreach(file IN LISTS TEST_FILES_${test}) string(SUBSTRING ${file} ${base_length} -1 rel_out_file) @@ -199,16 +198,17 @@ function(stage_correctness_package) set(src_dir "${src_dir}/") string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir) string(SUBSTRING ${file} ${dir_len} -1 rel_out_file) - set(out_file ${STAGE_OUT_DIR}/${rel_out_file}) + set(out_file ${STAGE_OUT_DIR}/${rel_out_file}) list(APPEND external_files ${out_file}) - add_custom_command( + add_custom_command( OUTPUT ${out_file} - DEPENDS ${file} - COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file} - COMMENT "Copying ${STAGE_CONTEXT} external file ${file}" - ) + DEPENDS ${file} + COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file} + COMMENT "Copying ${STAGE_CONTEXT} external file ${file}" + ) endforeach() endforeach() + list(APPEND package_files ${STAGE_OUT_DIR}/bin/fdbserver ${STAGE_OUT_DIR}/bin/coverage.fdbserver.xml ${STAGE_OUT_DIR}/bin/coverage.fdbclient.xml @@ -218,6 +218,7 @@ function(stage_correctness_package) ${STAGE_OUT_DIR}/bin/TraceLogHelper.dll ${STAGE_OUT_DIR}/CMakeCache.txt ) + add_custom_command( OUTPUT ${package_files} DEPENDS ${CMAKE_BINARY_DIR}/CMakeCache.txt @@ -239,6 +240,20 @@ function(stage_correctness_package) ${STAGE_OUT_DIR}/bin COMMENT "Copying files for 
${STAGE_CONTEXT} package" ) + + set(test_harness_dir "${CMAKE_SOURCE_DIR}/contrib/TestHarness2") + file(GLOB_RECURSE test_harness2_files RELATIVE "${test_harness_dir}" CONFIGURE_DEPENDS "${test_harness_dir}/*.py") + foreach(file IN LISTS test_harness2_files) + set(src_file "${test_harness_dir}/${file}") + set(out_file "${STAGE_OUT_DIR}/${file}") + get_filename_component(dir "${out_file}" DIRECTORY) + file(MAKE_DIRECTORY "${dir}") + add_custom_command(OUTPUT ${out_file} + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${out_file}" + DEPENDS "${src_file}") + list(APPEND package_files "${out_file}") + endforeach() + list(APPEND package_files ${test_files} ${external_files}) if(STAGE_OUT_FILES) set(${STAGE_OUT_FILES} ${package_files} PARENT_SCOPE) @@ -404,7 +419,7 @@ endfunction() # Creates a single cluster before running the specified command (usually a ctest test) function(add_fdbclient_test) - set(options DISABLED ENABLED DISABLE_LOG_DUMP API_TEST_BLOB_GRANULES_ENABLED TLS_ENABLED) + set(options DISABLED ENABLED DISABLE_TENANTS DISABLE_LOG_DUMP API_TEST_BLOB_GRANULES_ENABLED TLS_ENABLED) set(oneValueArgs NAME PROCESS_NUMBER TEST_TIMEOUT WORKING_DIRECTORY) set(multiValueArgs COMMAND) cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") @@ -431,6 +446,9 @@ function(add_fdbclient_test) if(T_DISABLE_LOG_DUMP) list(APPEND TMP_CLUSTER_CMD --disable-log-dump) endif() + if(T_DISABLE_TENANTS) + list(APPEND TMP_CLUSTER_CMD --disable-tenants) + endif() if(T_API_TEST_BLOB_GRANULES_ENABLED) list(APPEND TMP_CLUSTER_CMD --blob-granules-enabled) endif() @@ -447,9 +465,13 @@ function(add_fdbclient_test) set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT}) else() # default timeout - set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + if(USE_SANITIZER) + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 1200) + else() + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + endif() endif() - 
set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") endfunction() # Creates a cluster file for a nonexistent cluster before running the specified command @@ -483,7 +505,7 @@ function(add_unavailable_fdbclient_test) # default timeout set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 60) endif() - set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") endfunction() # Creates 3 distinct clusters before running the specified command. diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 79e20420af..d753cf394d 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -69,6 +69,7 @@ if(WIN32) add_definitions(-DWIN32_LEAN_AND_MEAN) add_definitions(-D_ITERATOR_DEBUG_LEVEL=0) add_definitions(-DNOGDI) # WinGDI.h defines macro ERROR + add_definitions(-D_USE_MATH_DEFINES) # Math constants endif() if (USE_CCACHE) @@ -191,6 +192,7 @@ else() endif() if(USE_GCOV) + add_compile_options(--coverage) add_link_options(--coverage) endif() @@ -199,6 +201,8 @@ else() -fsanitize=undefined # TODO(atn34) Re-enable -fsanitize=alignment once https://github.com/apple/foundationdb/issues/1434 is resolved -fno-sanitize=alignment + # https://github.com/apple/foundationdb/issues/7955 + -fno-sanitize=function -DBOOST_USE_UCONTEXT) list(APPEND SANITIZER_LINK_OPTIONS -fsanitize=undefined) endif() diff --git a/cmake/awssdk.cmake b/cmake/awssdk.cmake index 88cb7c78e9..0fef54338d 100644 --- a/cmake/awssdk.cmake +++ b/cmake/awssdk.cmake @@ -11,7 +11,7 @@ endif() include(ExternalProject) ExternalProject_Add(awssdk_project GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git - GIT_TAG 2af3ce543c322cb259471b3b090829464f825972 # v1.9.200 + GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # 
v1.9.331 SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build" GIT_CONFIG advice.detachedHead=false @@ -35,6 +35,7 @@ ExternalProject_Add(awssdk_project "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a" @@ -75,6 +76,10 @@ add_library(awssdk_c_io STATIC IMPORTED) add_dependencies(awssdk_c_io awssdk_project) set_target_properties(awssdk_c_io PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a") +add_library(awssdk_c_sdkutils STATIC IMPORTED) +add_dependencies(awssdk_c_sdkutils awssdk_project) +set_target_properties(awssdk_c_sdkutils PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a") + add_library(awssdk_checksums STATIC IMPORTED) add_dependencies(awssdk_checksums awssdk_project) set_target_properties(awssdk_checksums PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a") @@ -94,4 +99,4 @@ set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURR # link them all together in one interface target add_library(awssdk_target INTERFACE) target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include) -target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl) 
\ No newline at end of file +target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_sdkutils awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl) diff --git a/contrib/Joshua/scripts/correctnessTest.sh b/contrib/Joshua/scripts/correctnessTest.sh index a617d81088..bee09acf25 100755 --- a/contrib/Joshua/scripts/correctnessTest.sh +++ b/contrib/Joshua/scripts/correctnessTest.sh @@ -4,4 +4,6 @@ export ASAN_OPTIONS="detect_leaks=0" OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" -mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false +#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false + +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} diff --git a/contrib/Joshua/scripts/correctnessTimeout.sh b/contrib/Joshua/scripts/correctnessTimeout.sh index 7917aae591..6bd0bfeee0 100755 --- a/contrib/Joshua/scripts/correctnessTimeout.sh +++ b/contrib/Joshua/scripts/correctnessTimeout.sh @@ -1,4 +1,4 @@ #!/bin/bash -u -for file in `find . 
-name 'trace*.xml'` ; do - mono ./bin/TestHarness.exe summarize "${file}" summary.xml "" JoshuaTimeout true -done + + +python3 -m test_harness.timeout diff --git a/contrib/Joshua/scripts/valgrindTest.sh b/contrib/Joshua/scripts/valgrindTest.sh index 5409429691..820750f3b2 100755 --- a/contrib/Joshua/scripts/valgrindTest.sh +++ b/contrib/Joshua/scripts/valgrindTest.sh @@ -1,3 +1,3 @@ #!/bin/sh OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" -mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" true +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --use-valgrind diff --git a/contrib/Joshua/scripts/valgrindTimeout.sh b/contrib/Joshua/scripts/valgrindTimeout.sh index b9d9e7ebad..2224598e43 100755 --- a/contrib/Joshua/scripts/valgrindTimeout.sh +++ b/contrib/Joshua/scripts/valgrindTimeout.sh @@ -1,6 +1,2 @@ #!/bin/bash -u -for file in `find . -name 'trace*.xml'` ; do - for valgrindFile in `find . -name 'valgrind*.xml'` ; do - mono ./bin/TestHarness.exe summarize "${file}" summary.xml "${valgrindFile}" JoshuaTimeout true - done -done +python3 -m test_harness.timeout --use-valgrind diff --git a/contrib/TestHarness/Program.cs b/contrib/TestHarness/Program.cs index a31a6d6382..b3e003dee5 100644 --- a/contrib/TestHarness/Program.cs +++ b/contrib/TestHarness/Program.cs @@ -19,6 +19,7 @@ */ using System; +using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; @@ -302,6 +303,7 @@ namespace SummarizeTest uniqueFileSet.Add(file.Substring(0, file.LastIndexOf("-"))); // all restarting tests end with -1.txt or -2.txt } uniqueFiles = uniqueFileSet.ToArray(); + Array.Sort(uniqueFiles); testFile = random.Choice(uniqueFiles); // The on-disk format changed in 4.0.0, and 5.x can't load files from 3.x. 
string oldBinaryVersionLowerBound = "4.0.0"; @@ -334,8 +336,9 @@ namespace SummarizeTest // thus, by definition, if "until_" appears, we do not want to run with the current binary version oldBinaries = oldBinaries.Concat(currentBinary); } - List oldBinariesList = oldBinaries.ToList(); - if (oldBinariesList.Count == 0) { + string[] oldBinariesList = oldBinaries.ToArray(); + Array.Sort(oldBinariesList); + if (oldBinariesList.Count() == 0) { // In theory, restarting tests are named to have at least one old binary version to run // But if none of the provided old binaries fall in the range, we just skip the test Console.WriteLine("No available old binary version from {0} to {1}", oldBinaryVersionLowerBound, oldBinaryVersionUpperBound); @@ -347,6 +350,7 @@ namespace SummarizeTest else { uniqueFiles = Directory.GetFiles(testDir); + Array.Sort(uniqueFiles); testFile = random.Choice(uniqueFiles); } } @@ -487,6 +491,16 @@ namespace SummarizeTest useValgrind ? "on" : "off"); } + IDictionary data = Environment.GetEnvironmentVariables(); + foreach (DictionaryEntry i in data) + { + string k=(string)i.Key; + string v=(string)i.Value; + if (k.StartsWith("FDB_KNOB")) { + process.StartInfo.EnvironmentVariables[k]=v; + } + } + process.Start(); // SOMEDAY: Do we want to actually do anything with standard output or error? @@ -718,7 +732,7 @@ namespace SummarizeTest process.Refresh(); if (process.HasExited) return; - long mem = process.PrivateMemorySize64; + long mem = process.PagedMemorySize64; MaxMem = Math.Max(MaxMem, mem); //Console.WriteLine(string.Format("Process used {0} bytes", MaxMem)); Thread.Sleep(1000); @@ -744,16 +758,28 @@ namespace SummarizeTest AppendToSummary(summaryFileName, xout); } - // Parses the valgrind XML file and returns a list of "what" tags for each error. 
+ static string ParseValgrindStack(XElement stackElement) { + string backtrace = ""; + foreach (XElement frame in stackElement.Elements()) { + backtrace += " " + frame.Element("ip").Value.ToLower(); + } + if (backtrace.Length > 0) { + backtrace = "addr2line -e fdbserver.debug -p -C -f -i" + backtrace; + } + + return backtrace; + } + + // Parses the valgrind XML file and returns a list of error elements. // All errors for which the "kind" tag starts with "Leak" are ignored - static string[] ParseValgrindOutput(string valgrindOutputFileName, bool traceToStdout) + static XElement[] ParseValgrindOutput(string valgrindOutputFileName, bool traceToStdout) { if (!traceToStdout) { Console.WriteLine("Reading vXML file: " + valgrindOutputFileName); } - ISet whats = new HashSet(); + IList errors = new List(); XElement xdoc = XDocument.Load(valgrindOutputFileName).Element("valgrindoutput"); foreach(var elem in xdoc.Elements()) { if (elem.Name != "error") @@ -761,9 +787,29 @@ namespace SummarizeTest string kind = elem.Element("kind").Value; if(kind.StartsWith("Leak")) continue; - whats.Add(elem.Element("what").Value); + + XElement errorElement = new XElement("ValgrindError", + new XAttribute("Severity", (int)Magnesium.Severity.SevError)); + + int num = 1; + string suffix = ""; + foreach (XElement sub in elem.Elements()) { + if (sub.Name == "what") { + errorElement.SetAttributeValue("What", sub.Value); + } else if (sub.Name == "auxwhat") { + suffix = "Aux" + num++; + errorElement.SetAttributeValue("What" + suffix, sub.Value); + } else if (sub.Name == "stack") { + errorElement.SetAttributeValue("Backtrace" + suffix, ParseValgrindStack(sub)); + } else if (sub.Name == "origin") { + errorElement.SetAttributeValue("WhatOrigin", sub.Element("what").Value); + errorElement.SetAttributeValue("BacktraceOrigin", ParseValgrindStack(sub.Element("stack"))); + } + } + + errors.Add(errorElement); } - return whats.ToArray(); + return errors.ToArray(); } delegate IEnumerable 
parseDelegate(System.IO.Stream stream, string file, @@ -927,6 +973,10 @@ namespace SummarizeTest { xout.Add(new XElement(ev.Type, new XAttribute("File", ev.Details.File), new XAttribute("Line", ev.Details.Line))); } + if (ev.Type == "RunningUnitTest") + { + xout.Add(new XElement(ev.Type, new XAttribute("Name", ev.Details.Name), new XAttribute("File", ev.Details.File), new XAttribute("Line", ev.Details.Line))); + } if (ev.Type == "TestsExpectedToPass") testCount = int.Parse(ev.Details.Count); if (ev.Type == "TestResults" && ev.Details.Passed == "1") @@ -1065,12 +1115,10 @@ namespace SummarizeTest try { // If there are any errors reported "ok" will be set to false - var whats = ParseValgrindOutput(valgrindOutputFileName, traceToStdout); - foreach (var what in whats) + var valgrindErrors = ParseValgrindOutput(valgrindOutputFileName, traceToStdout); + foreach (var vError in valgrindErrors) { - xout.Add(new XElement("ValgrindError", - new XAttribute("Severity", (int)Magnesium.Severity.SevError), - new XAttribute("What", what))); + xout.Add(vError); ok = false; error = true; } diff --git a/contrib/TestHarness2/.gitignore b/contrib/TestHarness2/.gitignore new file mode 100644 index 0000000000..80682f9552 --- /dev/null +++ b/contrib/TestHarness2/.gitignore @@ -0,0 +1,2 @@ +/tmp/ +/venv diff --git a/contrib/TestHarness2/test_harness/__init__.py b/contrib/TestHarness2/test_harness/__init__.py new file mode 100644 index 0000000000..3cb95520ec --- /dev/null +++ b/contrib/TestHarness2/test_harness/__init__.py @@ -0,0 +1,2 @@ +# Currently this file is left intentionally empty. It's main job for now is to indicate that this directory +# should be used as a module. 
diff --git a/contrib/TestHarness2/test_harness/app.py b/contrib/TestHarness2/test_harness/app.py new file mode 100644 index 0000000000..3e300c6bf4 --- /dev/null +++ b/contrib/TestHarness2/test_harness/app.py @@ -0,0 +1,25 @@ +import argparse +import sys +import traceback + +from test_harness.config import config +from test_harness.run import TestRunner +from test_harness.summarize import SummaryTree + +if __name__ == '__main__': + try: + parser = argparse.ArgumentParser('TestHarness', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + test_runner = TestRunner() + if not test_runner.run(): + exit(1) + except Exception as e: + _, _, exc_traceback = sys.exc_info() + error = SummaryTree('TestHarnessError') + error.attributes['Severity'] = '40' + error.attributes['ErrorMessage'] = str(e) + error.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + error.dump(sys.stdout) + exit(1) diff --git a/contrib/TestHarness2/test_harness/config.py b/contrib/TestHarness2/test_harness/config.py new file mode 100644 index 0000000000..d0a11cf85c --- /dev/null +++ b/contrib/TestHarness2/test_harness/config.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import argparse +import collections +import copy +import os +import random +from enum import Enum +from pathlib import Path +from typing import List, Any, OrderedDict, Dict + + +class BuggifyOptionValue(Enum): + ON = 1 + OFF = 2 + RANDOM = 3 + + +class BuggifyOption: + def __init__(self, val: str | None = None): + self.value = BuggifyOptionValue.RANDOM + if val is not None: + v = val.lower() + if v in ['on', '1', 'true']: + self.value = BuggifyOptionValue.ON + elif v in ['off', '0', 'false']: + self.value = BuggifyOptionValue.OFF + elif v in ['random', 'rnd', 'r']: + pass + else: + assert False, 'Invalid value {} -- use true, false, or random'.format(v) + + +class ConfigValue: + def __init__(self, name: str, **kwargs): 
+ self.name = name + self.value = None + self.kwargs = kwargs + if 'default' in self.kwargs: + self.value = self.kwargs['default'] + + def get_arg_name(self) -> str: + if 'long_name' in self.kwargs: + return self.kwargs['long_name'] + else: + return self.name + + def add_to_args(self, parser: argparse.ArgumentParser): + kwargs = copy.copy(self.kwargs) + long_name = self.name + short_name = None + if 'long_name' in kwargs: + long_name = kwargs['long_name'] + del kwargs['long_name'] + if 'short_name' in kwargs: + short_name = kwargs['short_name'] + del kwargs['short_name'] + if 'action' in kwargs and kwargs['action'] in ['store_true', 'store_false']: + del kwargs['type'] + long_name = long_name.replace('_', '-') + if short_name is None: + # line below is useful for debugging + # print('add_argument(\'--{}\', [{{{}}}])'.format(long_name, ', '.join(['\'{}\': \'{}\''.format(k, v) + # for k, v in kwargs.items()]))) + parser.add_argument('--{}'.format(long_name), **kwargs) + else: + # line below is useful for debugging + # print('add_argument(\'-{}\', \'--{}\', [{{{}}}])'.format(short_name, long_name, + # ', '.join(['\'{}\': \'{}\''.format(k, v) + # for k, v in kwargs.items()]))) + parser.add_argument('-{}'.format(short_name), '--{}'.format(long_name), **kwargs) + + def get_value(self, args: argparse.Namespace) -> tuple[str, Any]: + return self.name, args.__getattribute__(self.get_arg_name()) + + +class Config: + """ + This is the central configuration class for test harness. The values in this class are exposed globally through + a global variable test_harness.config.config. This class provides some "magic" to keep test harness flexible. + Each parameter can further be configured using an `_args` member variable which is expected to be a dictionary. + * The value of any variable can be set through the command line. For a variable named `variable_name` we will + by default create a new command line option `--variable-name` (`_` is automatically changed to `-`). 
This + default can be changed by setting the `'long_name'` property in the `_arg` dict. + * In addition the user can also optionally set a short-name. This can be achieved by setting the `'short_name'` + property in the `_arg` dictionary. + * All additional properties in `_args` are passed to `argparse.add_argument`. + * If the default of a variable is `None` the user should explicitly set the `'type'` property to an appropriate + type. + * In addition to command line flags, all configuration options can also be controlled through environment variables. + By default, `variable-name` can be changed by setting the environment variable `TH_VARIABLE_NAME`. This default + can be changed by setting the `'env_name'` property. + * Test harness comes with multiple executables. Each of these should use the config facility. For this, + `Config.build_arguments` should be called first with the `argparse` parser. Then `Config.extract_args` needs + to be called with the result of `argparse.ArgumentParser.parse_args`. A sample example could look like this: + ``` + parser = argparse.ArgumentParser('TestHarness', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + ``` + * Changing the default value for all executables might not always be desirable. If it should be only changed for + one executable Config.change_default should be used. 
+ """ + def __init__(self): + self.random = random.Random() + self.cluster_file: str | None = None + self.cluster_file_args = {'short_name': 'C', 'type': str, 'help': 'Path to fdb cluster file', 'required': False, + 'env_name': 'JOSHUA_CLUSTER_FILE'} + self.joshua_dir: str | None = None + self.joshua_dir_args = {'type': str, 'help': 'Where to write FDB data to', 'required': False, + 'env_name': 'JOSHUA_APP_DIR'} + self.stats: str | None = None + self.stats_args = {'type': str, 'help': 'A base64 encoded list of statistics (used to reproduce runs)', + 'required': False} + self.random_seed: int | None = None + self.random_seed_args = {'type': int, + 'help': 'Force given seed given to fdbserver -- mostly useful for debugging', + 'required': False} + self.kill_seconds: int = 30 * 60 + self.kill_seconds_args = {'help': 'Timeout for individual test'} + self.buggify_on_ratio: float = 0.8 + self.buggify_on_ratio_args = {'help': 'Probability that buggify is turned on'} + self.write_run_times = False + self.write_run_times_args = {'help': 'Write back probabilities after each test run', + 'action': 'store_true'} + self.unseed_check_ratio: float = 0.05 + self.unseed_check_ratio_args = {'help': 'Probability for doing determinism check'} + self.test_dirs: List[str] = ['slow', 'fast', 'restarting', 'rare', 'noSim'] + self.test_dirs_args: dict = {'nargs': '*', 'help': 'test_directories to look for files in'} + self.trace_format: str = 'json' + self.trace_format_args = {'choices': ['json', 'xml'], 'help': 'What format fdb should produce'} + self.crash_on_error: bool = True + self.crash_on_error_args = {'long_name': 'no_crash', 'action': 'store_false', + 'help': 'Don\'t crash on first error'} + self.max_warnings: int = 10 + self.max_warnings_args = {'short_name': 'W'} + self.max_errors: int = 10 + self.max_errors_args = {'short_name': 'E'} + self.old_binaries_path: Path = Path('/app/deploy/global_data/oldBinaries/') + self.old_binaries_path_args = {'help': 'Path to the directory 
containing the old fdb binaries'} + self.use_valgrind: bool = False + self.use_valgrind_args = {'action': 'store_true'} + self.buggify = BuggifyOption('random') + self.buggify_args = {'short_name': 'b', 'choices': ['on', 'off', 'random']} + self.pretty_print: bool = False + self.pretty_print_args = {'short_name': 'P', 'action': 'store_true'} + self.clean_up: bool = True + self.clean_up_args = {'long_name': 'no_clean_up', 'action': 'store_false'} + self.run_dir: Path = Path('tmp') + self.joshua_seed: int = random.randint(0, 2 ** 32 - 1) + self.joshua_seed_args = {'short_name': 's', 'help': 'A random seed', 'env_name': 'JOSHUA_SEED'} + self.print_coverage = False + self.print_coverage_args = {'action': 'store_true'} + self.binary = Path('bin') / ('fdbserver.exe' if os.name == 'nt' else 'fdbserver') + self.binary_args = {'help': 'Path to executable'} + self.hit_per_runs_ratio: int = 20000 + self.hit_per_runs_ratio_args = {'help': 'Maximum test runs before each code probe hit at least once'} + self.output_format: str = 'xml' + self.output_format_args = {'short_name': 'O', 'choices': ['json', 'xml'], + 'help': 'What format TestHarness should produce'} + self.include_test_files: str = r'.*' + self.include_test_files_args = {'help': 'Only consider test files whose path match against the given regex'} + self.exclude_test_files: str = r'.^' + self.exclude_test_files_args = {'help': 'Don\'t consider test files whose path match against the given regex'} + self.include_test_classes: str = r'.*' + self.include_test_classes_args = {'help': 'Only consider tests whose names match against the given regex'} + self.exclude_test_names: str = r'.^' + self.exclude_test_names_args = {'help': 'Don\'t consider tests whose names match against the given regex'} + self.details: bool = False + self.details_args = {'help': 'Print detailed results', 'short_name': 'c', 'action': 'store_true'} + self.success: bool = False + self.success_args = {'help': 'Print successful results', 'action': 
'store_true'} + self.cov_include_files: str = r'.*' + self.cov_include_files_args = {'help': 'Only consider coverage traces that originated in files matching regex'} + self.cov_exclude_files: str = r'.^' + self.cov_exclude_files_args = {'help': 'Ignore coverage traces that originated in files matching regex'} + self.max_stderr_bytes: int = 1000 + self.write_stats: bool = True + self.read_stats: bool = True + self.reproduce_prefix: str | None = None + self.reproduce_prefix_args = {'type': str, 'required': False, + 'help': 'When printing the results, prepend this string to the command'} + self._env_names: Dict[str, str] = {} + self._config_map = self._build_map() + self._read_env() + self.random.seed(self.joshua_seed, version=2) + + def change_default(self, attr: str, default_val): + assert attr in self._config_map, 'Unknown config attribute {}'.format(attr) + self.__setattr__(attr, default_val) + self._config_map[attr].kwargs['default'] = default_val + + def _get_env_name(self, var_name: str) -> str: + return self._env_names.get(var_name, 'TH_{}'.format(var_name.upper())) + + def dump(self): + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or callable(obj) or attr.endswith('_args'): + continue + print('config.{}: {} = {}'.format(attr, type(obj), obj)) + + def _build_map(self) -> OrderedDict[str, ConfigValue]: + config_map: OrderedDict[str, ConfigValue] = collections.OrderedDict() + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or callable(obj): + continue + if attr.endswith('_args'): + name = attr[0:-len('_args')] + assert name in config_map + assert isinstance(obj, dict) + for k, v in obj.items(): + if k == 'env_name': + self._env_names[name] = v + else: + config_map[name].kwargs[k] = v + else: + # attribute_args has to be declared after the attribute + assert attr not in config_map + val_type = type(obj) + kwargs = {'type': val_type, 'default': obj} + 
config_map[attr] = ConfigValue(attr, **kwargs) + return config_map + + def _read_env(self): + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or attr.endswith('_args') or callable(obj): + continue + env_name = self._get_env_name(attr) + attr_type = self._config_map[attr].kwargs['type'] + assert type(None) != attr_type + e = os.getenv(env_name) + if e is not None: + # Use the env var to supply the default value, so that if the + # environment variable is set and the corresponding command line + # flag is not, the environment variable has an effect. + self._config_map[attr].kwargs['default'] = attr_type(e) + + def build_arguments(self, parser: argparse.ArgumentParser): + for val in self._config_map.values(): + val.add_to_args(parser) + + def extract_args(self, args: argparse.Namespace): + for val in self._config_map.values(): + k, v = val.get_value(args) + if v is not None: + config.__setattr__(k, v) + self.random.seed(self.joshua_seed, version=2) + + +config = Config() + +if __name__ == '__main__': + # test the config setup + parser = argparse.ArgumentParser('TestHarness Config Tester', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + config.dump() diff --git a/contrib/TestHarness2/test_harness/fdb.py b/contrib/TestHarness2/test_harness/fdb.py new file mode 100644 index 0000000000..1e6afa3906 --- /dev/null +++ b/contrib/TestHarness2/test_harness/fdb.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +from typing import OrderedDict, Tuple, List + +import collections +import fdb +import fdb.tuple +import struct + +from test_harness.run import StatFetcher, TestDescription +from test_harness.config import config +from test_harness.summarize import SummaryTree, Coverage + +# Before increasing this, make sure that all Joshua clusters (at Apple and Snowflake) have been upgraded. 
+# This version needs to be changed if we either need newer features from FDB or the current API version is +# getting retired. +fdb.api_version(630) + + +def str_to_tuple(s: str | None): + if s is None: + return s + return tuple(s.split(',')) + + +fdb_db = None + + +def open_db(cluster_file: str | None): + global fdb_db + if fdb_db is None: + fdb_db = fdb.open(cluster_file) + return fdb_db + + +def chunkify(iterable, sz: int): + res = [] + for item in iterable: + res.append(item) + if len(res) >= sz: + yield res + res = [] + if len(res) > 0: + yield res + + +@fdb.transactional +def write_coverage_chunk(tr, path: Tuple[str, ...], metadata: Tuple[str, ...], + coverage: List[Tuple[Coverage, bool]], initialized: bool) -> bool: + cov_dir = fdb.directory.create_or_open(tr, path) + if not initialized: + metadata_dir = fdb.directory.create_or_open(tr, metadata) + v = tr[metadata_dir['initialized']] + initialized = v.present() + for cov, covered in coverage: + if not initialized or covered: + tr.add(cov_dir.pack((cov.file, cov.line, cov.comment)), struct.pack(' OrderedDict[Coverage, int]: + res = collections.OrderedDict() + cov_dir = fdb.directory.create_or_open(tr, cov_path) + for k, v in tr[cov_dir.range()]: + file, line, comment = cov_dir.unpack(k) + count = struct.unpack(' OrderedDict[Coverage, int]: + db = open_db(cluster_file) + return _read_coverage(db, cov_path) + + +class TestStatistics: + def __init__(self, runtime: int, run_count: int): + self.runtime: int = runtime + self.run_count: int = run_count + + +class Statistics: + def __init__(self, cluster_file: str | None, joshua_dir: Tuple[str, ...]): + self.db = open_db(cluster_file) + self.stats_dir = self.open_stats_dir(self.db, joshua_dir) + self.stats: OrderedDict[str, TestStatistics] = self.read_stats_from_db(self.db) + + @fdb.transactional + def open_stats_dir(self, tr, app_dir: Tuple[str]): + stats_dir = app_dir + ('runtime_stats',) + return fdb.directory.create_or_open(tr, stats_dir) + + @fdb.transactional 
+ def read_stats_from_db(self, tr) -> OrderedDict[str, TestStatistics]: + result = collections.OrderedDict() + for k, v in tr[self.stats_dir.range()]: + test_name = self.stats_dir.unpack(k)[0] + runtime, run_count = struct.unpack(' None: + key = self.stats_dir.pack((test_name,)) + tr.add(key, struct.pack(' None: + assert self.db is not None + self._write_runtime(self.db, test_name, time) + + +class FDBStatFetcher(StatFetcher): + def __init__(self, tests: OrderedDict[str, TestDescription], + joshua_dir: Tuple[str] = str_to_tuple(config.joshua_dir)): + super().__init__(tests) + self.statistics = Statistics(config.cluster_file, joshua_dir) + + def read_stats(self): + for k, v in self.statistics.stats.items(): + if k in self.tests.keys(): + self.tests[k].total_runtime = v.runtime + self.tests[k].num_runs = v.run_count + + def add_run_time(self, test_name: str, runtime: int, out: SummaryTree): + self.statistics.write_runtime(test_name, runtime) + super().add_run_time(test_name, runtime, out) diff --git a/contrib/TestHarness2/test_harness/joshua.py b/contrib/TestHarness2/test_harness/joshua.py new file mode 100644 index 0000000000..33c5881dcc --- /dev/null +++ b/contrib/TestHarness2/test_harness/joshua.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +import collections +import io +import sys +import xml.sax +import xml.sax.handler +from pathlib import Path +from typing import List, OrderedDict, Set + +from joshua import joshua_model + +import test_harness.run +from test_harness.config import config +from test_harness.summarize import SummaryTree + + +class ToSummaryTree(xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.root: SummaryTree | None = None + self.stack: List[SummaryTree] = [] + + def result(self) -> SummaryTree: + assert len(self.stack) == 0 and self.root is not None, 'Parse Error' + return self.root + + def startElement(self, name, attrs): + new_child = SummaryTree(name) + for k, v in attrs.items(): + 
new_child.attributes[k] = v + self.stack.append(new_child) + + def endElement(self, name): + closed = self.stack.pop() + assert closed.name == name + if len(self.stack) == 0: + self.root = closed + else: + self.stack[-1].children.append(closed) + + +def _print_summary(summary: SummaryTree, commands: Set[str]): + cmd = [] + if config.reproduce_prefix is not None: + cmd.append(config.reproduce_prefix) + cmd.append('fdbserver') + if 'TestFile' in summary.attributes: + file_name = summary.attributes['TestFile'] + role = 'test' if test_harness.run.is_no_sim(Path(file_name)) else 'simulation' + cmd += ['-r', role, '-f', file_name] + else: + cmd += ['-r', 'simulation', '-f', ''] + if 'RandomSeed' in summary.attributes: + cmd += ['-s', summary.attributes['RandomSeed']] + else: + cmd += ['-s', ''] + if 'BuggifyEnabled' in summary.attributes: + arg = 'on' + if summary.attributes['BuggifyEnabled'].lower() in ['0', 'off', 'false']: + arg = 'off' + cmd += ['-b', arg] + else: + cmd += ['b', ''] + cmd += ['--crash', '--trace_format', config.trace_format] + key = ' '.join(cmd) + count = 1 + while key in commands: + key = '{} # {}'.format(' '.join(cmd), count) + count += 1 + # we want the command as the first attribute + attributes = {'Command': ' '.join(cmd)} + for k, v in summary.attributes.items(): + if k == 'Errors': + attributes['ErrorCount'] = v + else: + attributes[k] = v + summary.attributes = attributes + if config.details: + key = str(len(commands)) + str_io = io.StringIO() + summary.dump(str_io, prefix=(' ' if config.pretty_print else '')) + if config.output_format == 'json': + sys.stdout.write('{}"Test{}": {}'.format(' ' if config.pretty_print else '', + key, str_io.getvalue())) + else: + sys.stdout.write(str_io.getvalue()) + if config.pretty_print: + sys.stdout.write('\n' if config.output_format == 'xml' else ',\n') + return key + error_count = 0 + warning_count = 0 + small_summary = SummaryTree('Test') + small_summary.attributes = attributes + errors = 
SummaryTree('Errors') + warnings = SummaryTree('Warnings') + buggifies: OrderedDict[str, List[int]] = collections.OrderedDict() + for child in summary.children: + if 'Severity' in child.attributes and child.attributes['Severity'] == '40' and error_count < config.max_errors: + error_count += 1 + errors.append(child) + if 'Severity' in child.attributes and child.attributes[ + 'Severity'] == '30' and warning_count < config.max_warnings: + warning_count += 1 + warnings.append(child) + if child.name == 'BuggifySection': + file = child.attributes['File'] + line = int(child.attributes['Line']) + buggifies.setdefault(file, []).append(line) + buggifies_elem = SummaryTree('Buggifies') + for file, lines in buggifies.items(): + lines.sort() + if config.output_format == 'json': + buggifies_elem.attributes[file] = ' '.join(str(line) for line in lines) + else: + child = SummaryTree('Buggify') + child.attributes['File'] = file + child.attributes['Lines'] = ' '.join(str(line) for line in lines) + small_summary.append(child) + small_summary.children.append(buggifies_elem) + if len(errors.children) > 0: + small_summary.children.append(errors) + if len(warnings.children) > 0: + small_summary.children.append(warnings) + output = io.StringIO() + small_summary.dump(output, prefix=(' ' if config.pretty_print else '')) + if config.output_format == 'json': + sys.stdout.write('{}"{}": {}'.format(' ' if config.pretty_print else '', key, output.getvalue().strip())) + else: + sys.stdout.write('{}{}'.format(' ' if config.pretty_print else '', output.getvalue().strip())) + sys.stdout.write('\n' if config.output_format == 'xml' else ',\n') + + +def print_errors(ensemble_id: str): + joshua_model.open(config.cluster_file) + properties = joshua_model.get_ensemble_properties(ensemble_id) + compressed = properties["compressed"] if "compressed" in properties else False + for rec in joshua_model.tail_results(ensemble_id, errors_only=(not config.success), compressed=compressed): + if len(rec) == 5: + 
version_stamp, result_code, host, seed, output = rec + elif len(rec) == 4: + version_stamp, result_code, host, output = rec + seed = None + elif len(rec) == 3: + version_stamp, result_code, output = rec + host = None + seed = None + elif len(rec) == 2: + version_stamp, seed = rec + output = str(joshua_model.fdb.tuple.unpack(seed)[0]) + "\n" + result_code = None + host = None + seed = None + else: + raise Exception("Unknown result format") + lines = output.splitlines() + commands: Set[str] = set() + for line in lines: + summary = ToSummaryTree() + xml.sax.parseString(line, summary) + commands.add(_print_summary(summary.result(), commands)) diff --git a/contrib/TestHarness2/test_harness/results.py b/contrib/TestHarness2/test_harness/results.py new file mode 100644 index 0000000000..486c497d35 --- /dev/null +++ b/contrib/TestHarness2/test_harness/results.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import argparse +import io +import json +import re +import sys +import test_harness.fdb + +from typing import List, Tuple, OrderedDict +from test_harness.summarize import SummaryTree, Coverage +from test_harness.config import config +from xml.sax.saxutils import quoteattr + + +class GlobalStatistics: + def __init__(self): + self.total_probes_hit: int = 0 + self.total_cpu_time: int = 0 + self.total_test_runs: int = 0 + self.total_missed_probes: int = 0 + + +class EnsembleResults: + def __init__(self, cluster_file: str | None, ensemble_id: str): + self.global_statistics = GlobalStatistics() + self.fdb_path = ('joshua', 'ensembles', 'results', 'application', ensemble_id) + self.coverage_path = self.fdb_path + ('coverage',) + self.statistics = test_harness.fdb.Statistics(cluster_file, self.fdb_path) + coverage_dict: OrderedDict[Coverage, int] = test_harness.fdb.read_coverage(cluster_file, self.coverage_path) + self.coverage: List[Tuple[Coverage, int]] = [] + self.min_coverage_hit: int | None = None + self.ratio = self.global_statistics.total_test_runs / 
config.hit_per_runs_ratio + for cov, count in coverage_dict.items(): + if re.search(config.cov_include_files, cov.file) is None: + continue + if re.search(config.cov_exclude_files, cov.file) is not None: + continue + self.global_statistics.total_probes_hit += count + self.coverage.append((cov, count)) + if count <= self.ratio: + self.global_statistics.total_missed_probes += 1 + if self.min_coverage_hit is None or self.min_coverage_hit > count: + self.min_coverage_hit = count + self.coverage.sort(key=lambda x: (x[1], x[0].file, x[0].line)) + self.stats: List[Tuple[str, int, int]] = [] + for k, v in self.statistics.stats.items(): + self.global_statistics.total_test_runs += v.run_count + self.global_statistics.total_cpu_time += v.runtime + self.stats.append((k, v.runtime, v.run_count)) + self.stats.sort(key=lambda x: x[1], reverse=True) + if self.min_coverage_hit is not None: + self.coverage_ok = self.min_coverage_hit > self.ratio + else: + self.coverage_ok = False + + def dump(self, prefix: str): + errors = 0 + out = SummaryTree('EnsembleResults') + out.attributes['TotalRuntime'] = str(self.global_statistics.total_cpu_time) + out.attributes['TotalTestRuns'] = str(self.global_statistics.total_test_runs) + out.attributes['TotalProbesHit'] = str(self.global_statistics.total_probes_hit) + out.attributes['MinProbeHit'] = str(self.min_coverage_hit) + out.attributes['TotalProbes'] = str(len(self.coverage)) + out.attributes['MissedProbes'] = str(self.global_statistics.total_missed_probes) + + for cov, count in self.coverage: + severity = 10 if count > self.ratio else 40 + if severity == 40: + errors += 1 + if (severity == 40 and errors <= config.max_errors) or config.details: + child = SummaryTree('CodeProbe') + child.attributes['Severity'] = str(severity) + child.attributes['File'] = cov.file + child.attributes['Line'] = str(cov.line) + child.attributes['Comment'] = '' if cov.comment is None else cov.comment + child.attributes['HitCount'] = str(count) + out.append(child) + 
+ if config.details: + for k, runtime, run_count in self.stats: + child = SummaryTree('Test') + child.attributes['Name'] = k + child.attributes['Runtime'] = str(runtime) + child.attributes['RunCount'] = str(run_count) + out.append(child) + if errors > 0: + out.attributes['Errors'] = str(errors) + str_io = io.StringIO() + out.dump(str_io, prefix=prefix, new_line=config.pretty_print) + if config.output_format == 'xml': + sys.stdout.write(str_io.getvalue()) + else: + sys.stdout.write('{}"EnsembleResults":{}{}'.format(' ' if config.pretty_print else '', + '\n' if config.pretty_print else ' ', + str_io.getvalue())) + + +def write_header(ensemble_id: str): + if config.output_format == 'json': + if config.pretty_print: + print('{') + print(' "{}": {},\n'.format('ID', json.dumps(ensemble_id.strip()))) + else: + sys.stdout.write('{{{}: {},'.format('ID', json.dumps(ensemble_id.strip()))) + elif config.output_format == 'xml': + sys.stdout.write(''.format(quoteattr(ensemble_id.strip()))) + if config.pretty_print: + sys.stdout.write('\n') + else: + assert False, 'unknown output format {}'.format(config.output_format) + + +def write_footer(): + if config.output_format == 'xml': + sys.stdout.write('\n') + elif config.output_format == 'json': + sys.stdout.write('}\n') + else: + assert False, 'unknown output format {}'.format(config.output_format) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('TestHarness Results', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.change_default('pretty_print', True) + config.change_default('max_warnings', 0) + config.build_arguments(parser) + parser.add_argument('ensemble_id', type=str, help='The ensemble to fetch the result for') + args = parser.parse_args() + config.extract_args(args) + config.output_format = args.output_format + write_header(args.ensemble_id) + try: + import test_harness.joshua + test_harness.joshua.print_errors(args.ensemble_id) + except ModuleNotFoundError: + child = 
SummaryTree('JoshuaNotFound') + child.attributes['Severity'] = '30' + child.attributes['Message'] = 'Could not import Joshua -- set PYTHONPATH to joshua checkout dir' + child.dump(sys.stdout, prefix=(' ' if config.pretty_print else ''), new_line=config.pretty_print) + results = EnsembleResults(config.cluster_file, args.ensemble_id) + results.dump(' ' if config.pretty_print else '') + write_footer() + exit(0 if results.coverage_ok else 1) diff --git a/contrib/TestHarness2/test_harness/run.py b/contrib/TestHarness2/test_harness/run.py new file mode 100644 index 0000000000..c5e948eb6d --- /dev/null +++ b/contrib/TestHarness2/test_harness/run.py @@ -0,0 +1,465 @@ +from __future__ import annotations + +import array +import base64 +import collections +import math +import os +import resource +import shutil +import subprocess +import re +import sys +import threading +import time +import uuid + +from functools import total_ordering +from pathlib import Path +from test_harness.version import Version +from test_harness.config import config +from typing import List, Pattern, OrderedDict + +from test_harness.summarize import Summary, SummaryTree + + +@total_ordering +class TestDescription: + def __init__(self, path: Path, name: str, priority: float): + self.paths: List[Path] = [path] + self.name = name + self.priority: float = priority + # we only measure in seconds. 
Otherwise, keeping determinism will be difficult + self.total_runtime: int = 0 + self.num_runs: int = 0 + + def __lt__(self, other): + if isinstance(other, TestDescription): + return self.name < other.name + else: + return self.name < str(other) + + def __eq__(self, other): + if isinstance(other, TestDescription): + return self.name < other.name + else: + return self.name < str(other.name) + + +class StatFetcher: + def __init__(self, tests: OrderedDict[str, TestDescription]): + self.tests = tests + + def read_stats(self): + pass + + def add_run_time(self, test_name: str, runtime: int, out: SummaryTree): + self.tests[test_name].total_runtime += runtime + + +class TestPicker: + def __init__(self, test_dir: Path): + if not test_dir.exists(): + raise RuntimeError('{} is neither a directory nor a file'.format(test_dir)) + self.include_files_regex = re.compile(config.include_test_files) + self.exclude_files_regex = re.compile(config.exclude_test_files) + self.include_tests_regex = re.compile(config.include_test_classes) + self.exclude_tests_regex = re.compile(config.exclude_test_names) + self.test_dir: Path = test_dir + self.tests: OrderedDict[str, TestDescription] = collections.OrderedDict() + self.restart_test: Pattern = re.compile(r".*-\d+\.(txt|toml)") + self.follow_test: Pattern = re.compile(r".*-[2-9]\d*\.(txt|toml)") + + for subdir in self.test_dir.iterdir(): + if subdir.is_dir() and subdir.name in config.test_dirs: + self.walk_test_dir(subdir) + self.stat_fetcher: StatFetcher + if config.stats is not None or config.joshua_dir is None: + self.stat_fetcher = StatFetcher(self.tests) + else: + from test_harness.fdb import FDBStatFetcher + self.stat_fetcher = FDBStatFetcher(self.tests) + if config.stats is not None: + self.load_stats(config.stats) + else: + self.fetch_stats() + + def add_time(self, test_file: Path, run_time: int, out: SummaryTree) -> None: + # getting the test name is fairly inefficient. 
But since we only have 100s of tests, I won't bother + test_name: str | None = None + test_desc: TestDescription | None = None + for name, test in self.tests.items(): + for p in test.paths: + test_files: List[Path] + if self.restart_test.match(p.name): + test_files = self.list_restart_files(p) + else: + test_files = [p] + for file in test_files: + if file.absolute() == test_file.absolute(): + test_name = name + test_desc = test + break + if test_name is not None: + break + if test_name is not None: + break + assert test_name is not None and test_desc is not None + self.stat_fetcher.add_run_time(test_name, run_time, out) + out.attributes['TotalTestTime'] = str(test_desc.total_runtime) + out.attributes['TestRunCount'] = str(test_desc.num_runs) + + def dump_stats(self) -> str: + res = array.array('I') + for _, spec in self.tests.items(): + res.append(spec.total_runtime) + return base64.standard_b64encode(res.tobytes()).decode('utf-8') + + def fetch_stats(self): + self.stat_fetcher.read_stats() + + def load_stats(self, serialized: str): + times = array.array('I') + times.frombytes(base64.standard_b64decode(serialized)) + assert len(times) == len(self.tests.items()) + for idx, (_, spec) in enumerate(self.tests.items()): + spec.total_runtime = times[idx] + + def parse_txt(self, path: Path): + if self.include_files_regex.search(str(path)) is None or self.exclude_files_regex.search(str(path)) is not None: + return + with path.open('r') as f: + test_name: str | None = None + test_class: str | None = None + priority: float | None = None + for line in f: + line = line.strip() + kv = line.split('=') + if len(kv) != 2: + continue + kv[0] = kv[0].strip() + kv[1] = kv[1].strip(' \r\n\t\'"') + if kv[0] == 'testTitle' and test_name is None: + test_name = kv[1] + if kv[0] == 'testClass' and test_class is None: + test_class = kv[1] + if kv[0] == 'testPriority' and priority is None: + try: + priority = float(kv[1]) + except ValueError: + raise RuntimeError("Can't parse {} -- 
testPriority in {} should be set to a float".format(kv[1], + path)) + if test_name is not None and test_class is not None and priority is not None: + break + if test_name is None: + return + if test_class is None: + test_class = test_name + if priority is None: + priority = 1.0 + if self.include_tests_regex.search(test_class) is None \ + or self.exclude_tests_regex.search(test_class) is not None: + return + if test_class not in self.tests: + self.tests[test_class] = TestDescription(path, test_class, priority) + else: + self.tests[test_class].paths.append(path) + + def walk_test_dir(self, test: Path): + if test.is_dir(): + for file in test.iterdir(): + self.walk_test_dir(file) + else: + # check whether we're looking at a restart test + if self.follow_test.match(test.name) is not None: + return + if test.suffix == '.txt' or test.suffix == '.toml': + self.parse_txt(test) + + @staticmethod + def list_restart_files(start_file: Path) -> List[Path]: + name = re.sub(r'-\d+.(txt|toml)', '', start_file.name) + res: List[Path] = [] + for test_file in start_file.parent.iterdir(): + if test_file.name.startswith(name): + res.append(test_file) + assert len(res) > 1 + res.sort() + return res + + def choose_test(self) -> List[Path]: + min_runtime: float | None = None + candidates: List[TestDescription] = [] + for _, v in self.tests.items(): + this_time = v.total_runtime * v.priority + if min_runtime is None or this_time < min_runtime: + min_runtime = this_time + candidates = [v] + elif this_time == min_runtime: + candidates.append(v) + candidates.sort() + choice = config.random.randint(0, len(candidates) - 1) + test = candidates[choice] + result = test.paths[config.random.randint(0, len(test.paths) - 1)] + if self.restart_test.match(result.name): + return self.list_restart_files(result) + else: + return [result] + + +class OldBinaries: + def __init__(self): + self.first_file_expr = re.compile(r'.*-1\.(txt|toml)') + self.old_binaries_path: Path = config.old_binaries_path + 
self.binaries: OrderedDict[Version, Path] = collections.OrderedDict() + if not self.old_binaries_path.exists() or not self.old_binaries_path.is_dir(): + return + exec_pattern = re.compile(r'fdbserver-\d+\.\d+\.\d+(\.exe)?') + for file in self.old_binaries_path.iterdir(): + if not file.is_file() or not os.access(file, os.X_OK): + continue + if exec_pattern.fullmatch(file.name) is not None: + self._add_file(file) + + def _add_file(self, file: Path): + version_str = file.name.split('-')[1] + if version_str.endswith('.exe'): + version_str = version_str[0:-len('.exe')] + ver = Version.parse(version_str) + self.binaries[ver] = file + + def choose_binary(self, test_file: Path) -> Path: + if len(self.binaries) == 0: + return config.binary + max_version = Version.max_version() + min_version = Version.parse('5.0.0') + dirs = test_file.parent.parts + if 'restarting' not in dirs: + return config.binary + version_expr = dirs[-1].split('_') + first_file = self.first_file_expr.match(test_file.name) is not None + if first_file and version_expr[0] == 'to': + # downgrade test -- first binary should be current one + return config.binary + if not first_file and version_expr[0] == 'from': + # upgrade test -- we only return an old version for the first test file + return config.binary + if version_expr[0] == 'from' or version_expr[0] == 'to': + min_version = Version.parse(version_expr[1]) + if len(version_expr) == 4 and version_expr[2] == 'until': + max_version = Version.parse(version_expr[3]) + candidates: List[Path] = [] + for ver, binary in self.binaries.items(): + if min_version <= ver <= max_version: + candidates.append(binary) + if len(candidates) == 0: + return config.binary + return config.random.choice(candidates) + + +def is_restarting_test(test_file: Path): + for p in test_file.parts: + if p == 'restarting': + return True + return False + + +def is_no_sim(test_file: Path): + return test_file.parts[-2] == 'noSim' + + +class ResourceMonitor(threading.Thread): + def 
__init__(self): + super().__init__() + self.start_time = time.time() + self.end_time: float | None = None + self._stop_monitor = False + self.max_rss = 0 + + def run(self) -> None: + while not self._stop_monitor: + time.sleep(1) + resources = resource.getrusage(resource.RUSAGE_CHILDREN) + self.max_rss = max(resources.ru_maxrss, self.max_rss) + + def stop(self): + self.end_time = time.time() + self._stop_monitor = True + + def time(self): + return self.end_time - self.start_time + + +class TestRun: + def __init__(self, binary: Path, test_file: Path, random_seed: int, uid: uuid.UUID, + restarting: bool = False, test_determinism: bool = False, buggify_enabled: bool = False, + stats: str | None = None, expected_unseed: int | None = None, will_restart: bool = False): + self.binary = binary + self.test_file = test_file + self.random_seed = random_seed + self.uid = uid + self.restarting = restarting + self.test_determinism = test_determinism + self.stats: str | None = stats + self.expected_unseed: int | None = expected_unseed + self.use_valgrind: bool = config.use_valgrind + self.old_binary_path: Path = config.old_binaries_path + self.buggify_enabled: bool = buggify_enabled + self.fault_injection_enabled: bool = True + self.trace_format: str | None = config.trace_format + if Version.of_binary(self.binary) < "6.1.0": + self.trace_format = None + self.temp_path = config.run_dir / str(self.uid) + # state for the run + self.retryable_error: bool = False + self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed, + will_restart=will_restart) + self.run_time: int = 0 + self.success = self.run() + + def log_test_plan(self, out: SummaryTree): + test_plan: SummaryTree = SummaryTree('TestPlan') + test_plan.attributes['TestUID'] = str(self.uid) + test_plan.attributes['RandomSeed'] = str(self.random_seed) + test_plan.attributes['TestFile'] = str(self.test_file) + test_plan.attributes['Buggify'] = '1' if self.buggify_enabled else '0' 
+ test_plan.attributes['FaultInjectionEnabled'] = '1' if self.fault_injection_enabled else '0' + test_plan.attributes['DeterminismCheck'] = '1' if self.test_determinism else '0' + out.append(test_plan) + + def delete_simdir(self): + shutil.rmtree(self.temp_path / Path('simfdb')) + + def run(self): + command: List[str] = [] + valgrind_file: Path | None = None + if self.use_valgrind: + command.append('valgrind') + valgrind_file = self.temp_path / Path('valgrind-{}.xml'.format(self.random_seed)) + dbg_path = os.getenv('FDB_VALGRIND_DBGPATH') + if dbg_path is not None: + command.append('--extra-debuginfo-path={}'.format(dbg_path)) + command += ['--xml=yes', '--xml-file={}'.format(valgrind_file.absolute()), '-q'] + command += [str(self.binary.absolute()), + '-r', 'test' if is_no_sim(self.test_file) else 'simulation', + '-f', str(self.test_file), + '-s', str(self.random_seed)] + if self.trace_format is not None: + command += ['--trace_format', self.trace_format] + if Version.of_binary(self.binary) >= '7.1.0': + command += ['-fi', 'on' if self.fault_injection_enabled else 'off'] + if self.restarting: + command.append('--restarting') + if self.buggify_enabled: + command += ['-b', 'on'] + if config.crash_on_error: + command.append('--crash') + + self.temp_path.mkdir(parents=True, exist_ok=True) + + # self.log_test_plan(out) + resources = ResourceMonitor() + resources.start() + process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path, + text=True) + did_kill = False + timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds + err_out: str + try: + _, err_out = process.communicate(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + _, err_out = process.communicate() + did_kill = True + resources.stop() + resources.join() + # we're rounding times up, otherwise we will prefer running very short tests (<1s) + self.run_time = math.ceil(resources.time()) + self.summary.runtime = 
resources.time() + self.summary.max_rss = resources.max_rss + self.summary.was_killed = did_kill + self.summary.valgrind_out_file = valgrind_file + self.summary.error_out = err_out + self.summary.summarize(self.temp_path, ' '.join(command)) + return self.summary.ok() + + +def decorate_summary(out: SummaryTree, test_file: Path, seed: int, buggify: bool): + """Sometimes a test can crash before ProgramStart is written to the traces. These + tests are then hard to reproduce (they can be reproduced through TestHarness but + require the user to run in the joshua docker container). To account for this we + will write the necessary information into the attributes if it is missing.""" + if 'TestFile' not in out.attributes: + out.attributes['TestFile'] = str(test_file) + if 'RandomSeed' not in out.attributes: + out.attributes['RandomSeed'] = str(seed) + if 'BuggifyEnabled' not in out.attributes: + out.attributes['BuggifyEnabled'] = '1' if buggify else '0' + + +class TestRunner: + def __init__(self): + self.uid = uuid.uuid4() + self.test_path: Path = Path('tests') + self.cluster_file: str | None = None + self.fdb_app_dir: str | None = None + self.binary_chooser = OldBinaries() + self.test_picker = TestPicker(self.test_path) + + def backup_sim_dir(self, seed: int): + temp_dir = config.run_dir / str(self.uid) + src_dir = temp_dir / 'simfdb' + assert src_dir.is_dir() + dest_dir = temp_dir / 'simfdb.{}'.format(seed) + assert not dest_dir.exists() + shutil.copytree(src_dir, dest_dir) + + def restore_sim_dir(self, seed: int): + temp_dir = config.run_dir / str(self.uid) + src_dir = temp_dir / 'simfdb.{}'.format(seed) + assert src_dir.exists() + dest_dir = temp_dir / 'simfdb' + shutil.rmtree(dest_dir) + shutil.move(src_dir, dest_dir) + + def run_tests(self, test_files: List[Path], seed: int, test_picker: TestPicker) -> bool: + result: bool = True + for count, file in enumerate(test_files): + will_restart = count + 1 < len(test_files) + binary = self.binary_chooser.choose_binary(file) 
+ unseed_check = not is_no_sim(file) and config.random.random() < config.unseed_check_ratio + buggify_enabled: bool = config.random.random() < config.buggify_on_ratio + if unseed_check and count != 0: + # for restarting tests we will need to restore the sim2 after the first run + self.backup_sim_dir(seed + count - 1) + run = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0, + stats=test_picker.dump_stats(), will_restart=will_restart, buggify_enabled=buggify_enabled) + result = result and run.success + test_picker.add_time(test_files[0], run.run_time, run.summary.out) + decorate_summary(run.summary.out, file, seed + count, run.buggify_enabled) + if unseed_check and run.summary.unseed: + run.summary.out.append(run.summary.list_simfdb()) + run.summary.out.dump(sys.stdout) + if not result: + return False + if unseed_check and run.summary.unseed is not None: + if count != 0: + self.restore_sim_dir(seed + count - 1) + run2 = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0, + stats=test_picker.dump_stats(), expected_unseed=run.summary.unseed, + will_restart=will_restart, buggify_enabled=buggify_enabled) + test_picker.add_time(file, run2.run_time, run.summary.out) + decorate_summary(run2.summary.out, file, seed + count, run.buggify_enabled) + run2.summary.out.dump(sys.stdout) + result = result and run2.success + if not result: + return False + return result + + def run(self) -> bool: + seed = config.random_seed if config.random_seed is not None else config.random.randint(0, 2 ** 32 - 1) + test_files = self.test_picker.choose_test() + success = self.run_tests(test_files, seed, self.test_picker) + if config.clean_up: + shutil.rmtree(config.run_dir / str(self.uid)) + return success diff --git a/contrib/TestHarness2/test_harness/summarize.py b/contrib/TestHarness2/test_harness/summarize.py new file mode 100644 index 0000000000..8be5d2b507 --- /dev/null +++ b/contrib/TestHarness2/test_harness/summarize.py @@ -0,0 
+1,620 @@ +from __future__ import annotations + +import collections +import inspect +import json +import os +import re +import sys +import traceback +import uuid +import xml.sax +import xml.sax.handler +import xml.sax.saxutils + +from pathlib import Path +from typing import List, Dict, TextIO, Callable, Optional, OrderedDict, Any, Tuple, Iterator, Iterable + +from test_harness.config import config +from test_harness.valgrind import parse_valgrind_output + + +class SummaryTree: + def __init__(self, name: str): + self.name = name + self.children: List[SummaryTree] = [] + self.attributes: Dict[str, str] = {} + + def append(self, element: SummaryTree): + self.children.append(element) + + def to_dict(self, add_name: bool = True) -> Dict[str, Any] | List[Any]: + if len(self.children) > 0 and len(self.attributes) == 0: + children = [] + for child in self.children: + children.append(child.to_dict()) + if add_name: + return {self.name: children} + else: + return children + res: Dict[str, Any] = {} + if add_name: + res['Type'] = self.name + for k, v in self.attributes.items(): + res[k] = v + children = [] + child_keys: Dict[str, int] = {} + for child in self.children: + if child.name in child_keys: + child_keys[child.name] += 1 + else: + child_keys[child.name] = 1 + for child in self.children: + if child_keys[child.name] == 1 and child.name not in self.attributes: + res[child.name] = child.to_dict(add_name=False) + else: + children.append(child.to_dict()) + if len(children) > 0: + res['children'] = children + return res + + def to_json(self, out: TextIO, prefix: str = ''): + res = json.dumps(self.to_dict(), indent=(' ' if config.pretty_print else None)) + for line in res.splitlines(False): + out.write('{}{}\n'.format(prefix, line)) + + def to_xml(self, out: TextIO, prefix: str = ''): + # minidom doesn't support omitting the xml declaration which is a problem for joshua + # However, our xml is very simple and therefore serializing manually is easy enough + attrs = [] + 
print_width = 120 + try: + print_width, _ = os.get_terminal_size() + except OSError: + pass + for k, v in self.attributes.items(): + attrs.append('{}={}'.format(k, xml.sax.saxutils.quoteattr(v))) + elem = '{}<{}{}'.format(prefix, self.name, ('' if len(attrs) == 0 else ' ')) + out.write(elem) + if config.pretty_print: + curr_line_len = len(elem) + for i in range(len(attrs)): + attr_len = len(attrs[i]) + if i == 0 or attr_len + curr_line_len + 1 <= print_width: + if i != 0: + out.write(' ') + out.write(attrs[i]) + curr_line_len += attr_len + else: + out.write('\n') + out.write(' ' * len(elem)) + out.write(attrs[i]) + curr_line_len = len(elem) + attr_len + else: + out.write(' '.join(attrs)) + if len(self.children) == 0: + out.write('/>') + else: + out.write('>') + for child in self.children: + if config.pretty_print: + out.write('\n') + child.to_xml(out, prefix=(' {}'.format(prefix) if config.pretty_print else prefix)) + if len(self.children) > 0: + out.write('{}{}'.format(('\n' if config.pretty_print else ''), prefix, self.name)) + + def dump(self, out: TextIO, prefix: str = '', new_line: bool = True): + if config.output_format == 'json': + self.to_json(out, prefix=prefix) + else: + self.to_xml(out, prefix=prefix) + if new_line: + out.write('\n') + + +ParserCallback = Callable[[Dict[str, str]], Optional[str]] + + +class ParseHandler: + def __init__(self, out: SummaryTree): + self.out = out + self.events: OrderedDict[Optional[Tuple[str, Optional[str]]], List[ParserCallback]] = collections.OrderedDict() + + def add_handler(self, attr: Tuple[str, Optional[str]], callback: ParserCallback) -> None: + self.events.setdefault(attr, []).append(callback) + + def _call(self, callback: ParserCallback, attrs: Dict[str, str]) -> str | None: + try: + return callback(attrs) + except Exception as e: + _, _, exc_traceback = sys.exc_info() + child = SummaryTree('NonFatalParseError') + child.attributes['Severity'] = '30' + child.attributes['ErrorMessage'] = str(e) + 
child.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + self.out.append(child) + return None + + def handle(self, attrs: Dict[str, str]): + if None in self.events: + for callback in self.events[None]: + self._call(callback, attrs) + for k, v in attrs.items(): + if (k, None) in self.events: + for callback in self.events[(k, None)]: + remap = self._call(callback, attrs) + if remap is not None: + v = remap + attrs[k] = v + if (k, v) in self.events: + for callback in self.events[(k, v)]: + remap = self._call(callback, attrs) + if remap is not None: + v = remap + attrs[k] = v + + +class Parser: + def parse(self, file: TextIO, handler: ParseHandler) -> None: + pass + + +class XmlParser(Parser, xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.handler: ParseHandler | None = None + + def parse(self, file: TextIO, handler: ParseHandler) -> None: + xml.sax.parse(file, self) + + def startElement(self, name, attrs) -> None: + attributes: Dict[str, str] = {} + for name in attrs.getNames(): + attributes[name] = attrs.getValue(name) + assert self.handler is not None + self.handler.handle(attributes) + + +class JsonParser(Parser): + def __init__(self): + super().__init__() + + def parse(self, file: TextIO, handler: ParseHandler): + for line in file: + obj = json.loads(line) + handler.handle(obj) + + +class Coverage: + def __init__(self, file: str, line: str | int, comment: str | None = None): + self.file = file + self.line = int(line) + self.comment = comment + + def to_tuple(self) -> Tuple[str, int, str | None]: + return self.file, self.line, self.comment + + def __eq__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() == other + elif isinstance(other, Coverage): + return self.to_tuple() == other.to_tuple() + else: + return False + + def __lt__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() < other + elif isinstance(other, Coverage): + 
return self.to_tuple() < other.to_tuple() + else: + return False + + def __le__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() <= other + elif isinstance(other, Coverage): + return self.to_tuple() <= other.to_tuple() + else: + return False + + def __gt__(self, other: Coverage) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() > other + elif isinstance(other, Coverage): + return self.to_tuple() > other.to_tuple() + else: + return False + + def __ge__(self, other): + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() >= other + elif isinstance(other, Coverage): + return self.to_tuple() >= other.to_tuple() + else: + return False + + def __hash__(self): + return hash((self.file, self.line, self.comment)) + + +class TraceFiles: + def __init__(self, path: Path): + self.path: Path = path + self.timestamps: List[int] = [] + self.runs: OrderedDict[int, List[Path]] = collections.OrderedDict() + trace_expr = re.compile(r'trace.*\.(json|xml)') + for file in self.path.iterdir(): + if file.is_file() and trace_expr.match(file.name) is not None: + ts = int(file.name.split('.')[6]) + if ts in self.runs: + self.runs[ts].append(file) + else: + self.timestamps.append(ts) + self.runs[ts] = [file] + self.timestamps.sort(reverse=True) + + def __getitem__(self, idx: int) -> List[Path]: + res = self.runs[self.timestamps[idx]] + res.sort() + return res + + def __len__(self) -> int: + return len(self.runs) + + def items(self) -> Iterator[List[Path]]: + class TraceFilesIterator(Iterable[List[Path]]): + def __init__(self, trace_files: TraceFiles): + self.current = 0 + self.trace_files: TraceFiles = trace_files + + def __iter__(self): + return self + + def __next__(self) -> List[Path]: + if len(self.trace_files) <= self.current: + raise StopIteration + self.current += 1 + return self.trace_files[self.current - 1] + return TraceFilesIterator(self) + + +class Summary: + def __init__(self, 
binary: Path, runtime: float = 0, max_rss: int | None = None, + was_killed: bool = False, uid: uuid.UUID | None = None, expected_unseed: int | None = None, + exit_code: int = 0, valgrind_out_file: Path | None = None, stats: str | None = None, + error_out: str = None, will_restart: bool = False): + self.binary = binary + self.runtime: float = runtime + self.max_rss: int | None = max_rss + self.was_killed: bool = was_killed + self.expected_unseed: int | None = expected_unseed + self.exit_code: int = exit_code + self.out: SummaryTree = SummaryTree('Test') + self.test_begin_found: bool = False + self.test_end_found: bool = False + self.unseed: int | None = None + self.valgrind_out_file: Path | None = valgrind_out_file + self.severity_map: OrderedDict[tuple[str, int], int] = collections.OrderedDict() + self.error: bool = False + self.errors: int = 0 + self.warnings: int = 0 + self.coverage: OrderedDict[Coverage, bool] = collections.OrderedDict() + self.test_count: int = 0 + self.tests_passed: int = 0 + self.error_out = error_out + self.stderr_severity: str = '40' + self.will_restart: bool = will_restart + self.test_dir: Path | None = None + + if uid is not None: + self.out.attributes['TestUID'] = str(uid) + if stats is not None: + self.out.attributes['Statistics'] = stats + self.out.attributes['JoshuaSeed'] = str(config.joshua_seed) + self.out.attributes['WillRestart'] = '1' if self.will_restart else '0' + + self.handler = ParseHandler(self.out) + self.register_handlers() + + def summarize_files(self, trace_files: List[Path]): + assert len(trace_files) > 0 + for f in trace_files: + self.parse_file(f) + self.done() + + def summarize(self, trace_dir: Path, command: str): + self.test_dir = trace_dir + trace_files = TraceFiles(trace_dir) + if len(trace_files) == 0: + self.error = True + child = SummaryTree('NoTracesFound') + child.attributes['Severity'] = '40' + child.attributes['Path'] = str(trace_dir.absolute()) + child.attributes['Command'] = command + 
self.out.append(child) + return + self.summarize_files(trace_files[0]) + if config.joshua_dir is not None: + import test_harness.fdb + test_harness.fdb.write_coverage(config.cluster_file, + test_harness.fdb.str_to_tuple(config.joshua_dir) + ('coverage',), + test_harness.fdb.str_to_tuple(config.joshua_dir) + ('coverage-metadata',), + self.coverage) + + def list_simfdb(self) -> SummaryTree: + res = SummaryTree('SimFDB') + res.attributes['TestDir'] = str(self.test_dir) + if self.test_dir is None: + return res + simfdb = self.test_dir / Path('simfdb') + if not simfdb.exists(): + res.attributes['NoSimDir'] = "simfdb doesn't exist" + return res + elif not simfdb.is_dir(): + res.attributes['NoSimDir'] = 'simfdb is not a directory' + return res + for file in simfdb.iterdir(): + child = SummaryTree('Directory' if file.is_dir() else 'File') + child.attributes['Name'] = file.name + res.append(child) + return res + + def ok(self): + return not self.error + + def done(self): + if config.print_coverage: + for k, v in self.coverage.items(): + child = SummaryTree('CodeCoverage') + child.attributes['File'] = k.file + child.attributes['Line'] = str(k.line) + if not v: + child.attributes['Covered'] = '0' + if k.comment is not None and len(k.comment): + child.attributes['Comment'] = k.comment + self.out.append(child) + if self.warnings > config.max_warnings: + child = SummaryTree('WarningLimitExceeded') + child.attributes['Severity'] = '30' + child.attributes['WarningCount'] = str(self.warnings) + self.out.append(child) + if self.errors > config.max_errors: + child = SummaryTree('ErrorLimitExceeded') + child.attributes['Severity'] = '40' + child.attributes['ErrorCount'] = str(self.errors) + self.out.append(child) + if self.was_killed: + child = SummaryTree('ExternalTimeout') + child.attributes['Severity'] = '40' + self.out.append(child) + self.error = True + if self.max_rss is not None: + self.out.attributes['PeakMemory'] = str(self.max_rss) + if self.valgrind_out_file is not None: + 
try: + valgrind_errors = parse_valgrind_output(self.valgrind_out_file) + for valgrind_error in valgrind_errors: + if valgrind_error.kind.startswith('Leak'): + continue + self.error = True + child = SummaryTree('ValgrindError') + child.attributes['Severity'] = '40' + child.attributes['What'] = valgrind_error.what.what + child.attributes['Backtrace'] = valgrind_error.what.backtrace + aux_count = 0 + for aux in valgrind_error.aux: + child.attributes['WhatAux{}'.format(aux_count)] = aux.what + child.attributes['BacktraceAux{}'.format(aux_count)] = aux.backtrace + aux_count += 1 + self.out.append(child) + except Exception as e: + self.error = True + child = SummaryTree('ValgrindParseError') + child.attributes['Severity'] = '40' + child.attributes['ErrorMessage'] = str(e) + _, _, exc_traceback = sys.exc_info() + child.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + self.out.append(child) + if not self.test_end_found: + child = SummaryTree('TestUnexpectedlyNotFinished') + child.attributes['Severity'] = '40' + self.out.append(child) + if self.error_out is not None and len(self.error_out) > 0: + lines = self.error_out.splitlines() + stderr_bytes = 0 + for line in lines: + if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"): + # When running ASAN we expect to see this message. Boost coroutine should be using the correct asan annotations so that it shouldn't produce any false positives. + continue + if line.endswith("Warning: unimplemented fcntl command: 1036"): + # Valgrind produces this warning when F_SET_RW_HINT is used + continue + if self.stderr_severity == '40': + self.error = True + remaining_bytes = config.max_stderr_bytes - stderr_bytes + if remaining_bytes > 0: + out_err = line[0:remaining_bytes] + ('...' 
if len(line) > remaining_bytes else '') + child = SummaryTree('StdErrOutput') + child.attributes['Severity'] = self.stderr_severity + child.attributes['Output'] = out_err + self.out.append(child) + stderr_bytes += len(line) + if stderr_bytes > config.max_stderr_bytes: + child = SummaryTree('StdErrOutputTruncated') + child.attributes['Severity'] = self.stderr_severity + child.attributes['BytesRemaining'] = stderr_bytes - config.max_stderr_bytes + self.out.append(child) + + self.out.attributes['Ok'] = '1' if self.ok() else '0' + if not self.ok(): + reason = 'Unknown' + if self.error: + reason = 'ProducedErrors' + elif not self.test_end_found: + reason = 'TestDidNotFinish' + elif self.tests_passed == 0: + reason = 'NoTestsPassed' + elif self.test_count != self.tests_passed: + reason = 'Expected {} tests to pass, but only {} did'.format(self.test_count, self.tests_passed) + self.out.attributes['FailReason'] = reason + + def parse_file(self, file: Path): + parser: Parser + if file.suffix == '.json': + parser = JsonParser() + elif file.suffix == '.xml': + parser = XmlParser() + else: + child = SummaryTree('TestHarnessBug') + child.attributes['File'] = __file__ + frame = inspect.currentframe() + if frame is not None: + child.attributes['Line'] = str(inspect.getframeinfo(frame).lineno) + child.attributes['Details'] = 'Unexpected suffix {} for file {}'.format(file.suffix, file.name) + self.error = True + self.out.append(child) + return + with file.open('r') as f: + try: + parser.parse(f, self.handler) + except Exception as e: + child = SummaryTree('SummarizationError') + child.attributes['Severity'] = '40' + child.attributes['ErrorMessage'] = str(e) + self.out.append(child) + + def register_handlers(self): + def remap_event_severity(attrs): + if 'Type' not in attrs or 'Severity' not in attrs: + return None + k = (attrs['Type'], int(attrs['Severity'])) + if k in self.severity_map: + return str(self.severity_map[k]) + + self.handler.add_handler(('Severity', None), 
remap_event_severity) + + def program_start(attrs: Dict[str, str]): + if self.test_begin_found: + return + self.test_begin_found = True + self.out.attributes['RandomSeed'] = attrs['RandomSeed'] + self.out.attributes['SourceVersion'] = attrs['SourceVersion'] + self.out.attributes['Time'] = attrs['ActualTime'] + self.out.attributes['BuggifyEnabled'] = attrs['BuggifyEnabled'] + self.out.attributes['DeterminismCheck'] = '0' if self.expected_unseed is None else '1' + if self.binary.name != 'fdbserver': + self.out.attributes['OldBinary'] = self.binary.name + if 'FaultInjectionEnabled' in attrs: + self.out.attributes['FaultInjectionEnabled'] = attrs['FaultInjectionEnabled'] + + self.handler.add_handler(('Type', 'ProgramStart'), program_start) + + def set_test_file(attrs: Dict[str, str]): + test_file = Path(attrs['TestFile']) + cwd = Path('.').absolute() + try: + test_file = test_file.relative_to(cwd) + except ValueError: + pass + self.out.attributes['TestFile'] = str(test_file) + + self.handler.add_handler(('Type', 'Simulation'), set_test_file) + self.handler.add_handler(('Type', 'NonSimulationTest'), set_test_file) + + def set_elapsed_time(attrs: Dict[str, str]): + if self.test_end_found: + return + self.test_end_found = True + self.unseed = int(attrs['RandomUnseed']) + if self.expected_unseed is not None and self.unseed != self.expected_unseed: + severity = 40 if ('UnseedMismatch', 40) not in self.severity_map \ + else self.severity_map[('UnseedMismatch', 40)] + if severity >= 30: + child = SummaryTree('UnseedMismatch') + child.attributes['Unseed'] = str(self.unseed) + child.attributes['ExpectedUnseed'] = str(self.expected_unseed) + child.attributes['Severity'] = str(severity) + if severity >= 40: + self.error = True + self.out.append(child) + self.out.attributes['SimElapsedTime'] = attrs['SimTime'] + self.out.attributes['RealElapsedTime'] = attrs['RealTime'] + if self.unseed is not None: + self.out.attributes['RandomUnseed'] = str(self.unseed) + + 
self.handler.add_handler(('Type', 'ElapsedTime'), set_elapsed_time) + + def parse_warning(attrs: Dict[str, str]): + self.warnings += 1 + if self.warnings > config.max_warnings: + return + child = SummaryTree(attrs['Type']) + for k, v in attrs.items(): + if k != 'Type': + child.attributes[k] = v + self.out.append(child) + + self.handler.add_handler(('Severity', '30'), parse_warning) + + def parse_error(attrs: Dict[str, str]): + self.errors += 1 + self.error = True + if self.errors > config.max_errors: + return + child = SummaryTree(attrs['Type']) + for k, v in attrs.items(): + child.attributes[k] = v + self.out.append(child) + + self.handler.add_handler(('Severity', '40'), parse_error) + + def coverage(attrs: Dict[str, str]): + covered = True + if 'Covered' in attrs: + covered = int(attrs['Covered']) != 0 + comment = '' + if 'Comment' in attrs: + comment = attrs['Comment'] + c = Coverage(attrs['File'], attrs['Line'], comment) + if covered or c not in self.coverage: + self.coverage[c] = covered + + self.handler.add_handler(('Type', 'CodeCoverage'), coverage) + + def expected_test_pass(attrs: Dict[str, str]): + self.test_count = int(attrs['Count']) + + self.handler.add_handler(('Type', 'TestsExpectedToPass'), expected_test_pass) + + def test_passed(attrs: Dict[str, str]): + if attrs['Passed'] == '1': + self.tests_passed += 1 + + self.handler.add_handler(('Type', 'TestResults'), test_passed) + + def remap_event_severity(attrs: Dict[str, str]): + self.severity_map[(attrs['TargetEvent'], int(attrs['OriginalSeverity']))] = int(attrs['NewSeverity']) + + self.handler.add_handler(('Type', 'RemapEventSeverity'), remap_event_severity) + + def buggify_section(attrs: Dict[str, str]): + if attrs['Type'] == 'FaultInjected' or attrs.get('Activated', '0') == '1': + child = SummaryTree(attrs['Type']) + child.attributes['File'] = attrs['File'] + child.attributes['Line'] = attrs['Line'] + self.out.append(child) + self.handler.add_handler(('Type', 'BuggifySection'), buggify_section) + 
self.handler.add_handler(('Type', 'FaultInjected'), buggify_section) + + def running_unit_test(attrs: Dict[str, str]): + child = SummaryTree('RunningUnitTest') + child.attributes['Name'] = attrs['Name'] + child.attributes['File'] = attrs['File'] + child.attributes['Line'] = attrs['Line'] + self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test) + + def stderr_severity(attrs: Dict[str, str]): + if 'NewSeverity' in attrs: + self.stderr_severity = attrs['NewSeverity'] + self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity) diff --git a/contrib/TestHarness2/test_harness/test_valgrind_parser.py b/contrib/TestHarness2/test_harness/test_valgrind_parser.py new file mode 100644 index 0000000000..0b36e8e6d5 --- /dev/null +++ b/contrib/TestHarness2/test_harness/test_valgrind_parser.py @@ -0,0 +1,16 @@ +import sys + +from test_harness.valgrind import parse_valgrind_output +from pathlib import Path + + +if __name__ == '__main__': + errors = parse_valgrind_output(Path(sys.argv[1])) + for valgrind_error in errors: + print('ValgrindError: what={}, kind={}'.format(valgrind_error.what.what, valgrind_error.kind)) + print('Backtrace: {}'.format(valgrind_error.what.backtrace)) + counter = 0 + for aux in valgrind_error.aux: + print('Aux {}:'.format(counter)) + print(' What: {}'.format(aux.what)) + print(' Backtrace: {}'.format(aux.backtrace)) diff --git a/contrib/TestHarness2/test_harness/timeout.py b/contrib/TestHarness2/test_harness/timeout.py new file mode 100644 index 0000000000..90af7096fd --- /dev/null +++ b/contrib/TestHarness2/test_harness/timeout.py @@ -0,0 +1,60 @@ +import argparse +import re +import sys + +from pathlib import Path +from test_harness.config import config +from test_harness.summarize import Summary, TraceFiles +from typing import Pattern, List + + +def files_matching(path: Path, pattern: Pattern, recurse: bool = True) -> List[Path]: + res: List[Path] = [] + for file in path.iterdir(): + if file.is_file() and 
pattern.match(file.name) is not None: + res.append(file) + elif file.is_dir() and recurse: + res += files_matching(file, pattern, recurse) + return res + + +def dirs_with_files_matching(path: Path, pattern: Pattern, recurse: bool = True) -> List[Path]: + res: List[Path] = [] + sub_directories: List[Path] = [] + has_file = False + for file in path.iterdir(): + if file.is_file() and pattern.match(file.name) is not None: + has_file = True + elif file.is_dir() and recurse: + sub_directories.append(file) + if has_file: + res.append(path) + if recurse: + for file in sub_directories: + res += dirs_with_files_matching(file, pattern, recurse=True) + res.sort() + return res + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('TestHarness Timeout', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + valgrind_files: List[Path] = [] + if config.use_valgrind: + valgrind_files = files_matching(Path.cwd(), re.compile(r'valgrind.*\.xml')) + + for directory in dirs_with_files_matching(Path.cwd(), re.compile(r'trace.*\.(json|xml)'), recurse=True): + trace_files = TraceFiles(directory) + for files in trace_files.items(): + if config.use_valgrind: + for valgrind_file in valgrind_files: + summary = Summary(Path('bin/fdbserver'), was_killed=True) + summary.valgrind_out_file = valgrind_file + summary.summarize_files(files) + summary.out.dump(sys.stdout) + else: + summary = Summary(Path('bin/fdbserver'), was_killed=True) + summary.summarize_files(files) + summary.out.dump(sys.stdout) diff --git a/contrib/TestHarness2/test_harness/valgrind.py b/contrib/TestHarness2/test_harness/valgrind.py new file mode 100644 index 0000000000..399b47c0cc --- /dev/null +++ b/contrib/TestHarness2/test_harness/valgrind.py @@ -0,0 +1,141 @@ +import enum +import xml +import xml.sax.handler +from pathlib import Path +from typing import List + + +class ValgrindWhat: + def __init__(self): + self.what: str 
= '' + self.backtrace: str = '' + + +class ValgrindError: + def __init__(self): + self.what: ValgrindWhat = ValgrindWhat() + self.kind: str = '' + self.aux: List[ValgrindWhat] = [] + + +# noinspection PyArgumentList +class ValgrindParseState(enum.Enum): + ROOT = enum.auto() + ERROR = enum.auto() + ERROR_AUX = enum.auto() + KIND = enum.auto() + WHAT = enum.auto() + TRACE = enum.auto() + AUX_WHAT = enum.auto() + STACK = enum.auto() + STACK_AUX = enum.auto() + STACK_IP = enum.auto() + STACK_IP_AUX = enum.auto() + + +class ValgrindHandler(xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.stack: List[ValgrindError] = [] + self.result: List[ValgrindError] = [] + self.state_stack: List[ValgrindParseState] = [] + + def state(self) -> ValgrindParseState: + if len(self.state_stack) == 0: + return ValgrindParseState.ROOT + return self.state_stack[-1] + + @staticmethod + def from_content(content): + # pdb.set_trace() + if isinstance(content, bytes): + return content.decode() + assert isinstance(content, str) + return content + + def characters(self, content): + # pdb.set_trace() + state = self.state() + if len(self.state_stack) == 0: + return + else: + assert len(self.stack) > 0 + if state is ValgrindParseState.KIND: + self.stack[-1].kind += self.from_content(content) + elif state is ValgrindParseState.WHAT: + self.stack[-1].what.what += self.from_content(content) + elif state is ValgrindParseState.AUX_WHAT: + self.stack[-1].aux[-1].what += self.from_content(content) + elif state is ValgrindParseState.STACK_IP: + self.stack[-1].what.backtrace += self.from_content(content) + elif state is ValgrindParseState.STACK_IP_AUX: + self.stack[-1].aux[-1].backtrace += self.from_content(content) + + def startElement(self, name, attrs): + # pdb.set_trace() + if name == 'error': + self.stack.append(ValgrindError()) + self.state_stack.append(ValgrindParseState.ERROR) + if len(self.stack) == 0: + return + if name == 'kind': + 
self.state_stack.append(ValgrindParseState.KIND) + elif name == 'what': + self.state_stack.append(ValgrindParseState.WHAT) + elif name == 'auxwhat': + assert self.state() in [ValgrindParseState.ERROR, ValgrindParseState.ERROR_AUX] + self.state_stack.pop() + self.state_stack.append(ValgrindParseState.ERROR_AUX) + self.state_stack.append(ValgrindParseState.AUX_WHAT) + self.stack[-1].aux.append(ValgrindWhat()) + elif name == 'stack': + state = self.state() + assert state in [ValgrindParseState.ERROR, ValgrindParseState.ERROR_AUX] + if state == ValgrindParseState.ERROR: + self.state_stack.append(ValgrindParseState.STACK) + else: + self.state_stack.append(ValgrindParseState.STACK_AUX) + elif name == 'ip': + state = self.state() + assert state in [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + if state == ValgrindParseState.STACK: + self.state_stack.append(ValgrindParseState.STACK_IP) + if len(self.stack[-1].what.backtrace) == 0: + self.stack[-1].what.backtrace = 'addr2line -e fdbserver.debug -p -C -f -i ' + else: + self.stack[-1].what.backtrace += ' ' + else: + self.state_stack.append(ValgrindParseState.STACK_IP_AUX) + if len(self.stack[-1].aux[-1].backtrace) == 0: + self.stack[-1].aux[-1].backtrace = 'addr2line -e fdbserver.debug -p -C -f -i ' + else: + self.stack[-1].aux[-1].backtrace += ' ' + + def endElement(self, name): + # pdb.set_trace() + if name == 'error': + self.result.append(self.stack.pop()) + self.state_stack.pop() + elif name == 'kind': + assert self.state() == ValgrindParseState.KIND + self.state_stack.pop() + elif name == 'what': + assert self.state() == ValgrindParseState.WHAT + self.state_stack.pop() + elif name == 'auxwhat': + assert self.state() == ValgrindParseState.AUX_WHAT + self.state_stack.pop() + elif name == 'stack': + assert self.state() in [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + self.state_stack.pop() + elif name == 'ip': + self.state_stack.pop() + state = self.state() + assert state in 
[ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + + +def parse_valgrind_output(valgrind_out_file: Path) -> List[ValgrindError]: + handler = ValgrindHandler() + with valgrind_out_file.open('r') as f: + xml.sax.parse(f, handler) + return handler.result diff --git a/contrib/TestHarness2/test_harness/version.py b/contrib/TestHarness2/test_harness/version.py new file mode 100644 index 0000000000..fe04206a8a --- /dev/null +++ b/contrib/TestHarness2/test_harness/version.py @@ -0,0 +1,66 @@ +from functools import total_ordering +from pathlib import Path +from typing import Tuple + + +@total_ordering +class Version: + def __init__(self): + self.major: int = 0 + self.minor: int = 0 + self.patch: int = 0 + + def version_tuple(self): + return self.major, self.minor, self.patch + + def _compare(self, other) -> int: + lhs: Tuple[int, int, int] = self.version_tuple() + rhs: Tuple[int, int, int] + if isinstance(other, Version): + rhs = other.version_tuple() + else: + rhs = Version.parse(str(other)).version_tuple() + if lhs < rhs: + return -1 + elif lhs > rhs: + return 1 + else: + return 0 + + def __eq__(self, other) -> bool: + return self._compare(other) == 0 + + def __lt__(self, other) -> bool: + return self._compare(other) < 0 + + def __hash__(self): + return hash(self.version_tuple()) + + def __str__(self): + return format('{}.{}.{}'.format(self.major, self.minor, self.patch)) + + @staticmethod + def of_binary(binary: Path): + parts = binary.name.split('-') + if len(parts) != 2: + return Version.max_version() + return Version.parse(parts[1]) + + @staticmethod + def parse(version: str): + version_tuple = version.split('.') + self = Version() + self.major = int(version_tuple[0]) + if len(version_tuple) > 1: + self.minor = int(version_tuple[1]) + if len(version_tuple) > 2: + self.patch = int(version_tuple[2]) + return self + + @staticmethod + def max_version(): + self = Version() + self.major = 2**32 - 1 + self.minor = 2**32 - 1 + self.patch = 2**32 - 1 + return self diff 
--git a/contrib/observability_splunk_dashboard/details.xml b/contrib/observability_splunk_dashboard/details.xml new file mode 100644 index 0000000000..70ff15883b --- /dev/null +++ b/contrib/observability_splunk_dashboard/details.xml @@ -0,0 +1,431 @@ +
+ + Details for FoundationDB Cluster +
+ + + * + + + + * + + + + + -60m@m + now + + + + + Default + 5 seconds + 1 minute + 10 minutes + 1 hour + 1 day + bins=100 + bins=100 + + + + All + Storage Server + Transaction Log + Proxy + Resolver + Master + Cluster Controller + Log Router + Data Distributor + Ratekeeper + Tester + + + + + * + + + + * + +
+ + + + Storage Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | rex field=BytesDurable "(?<DurableRate>.*) (?<DurableRoughness>.*) (?<DurableCounter>.*)" | eval QueueSize=InputCounter-DurableCounter | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Input Rate + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | timechart $Span$ avg(InputRate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Bytes Queried + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesQueried "(?<Rate>.*) (?<Roughness>.*) (?<Counter>.*)" | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Average Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ avg(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Max Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ max(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Disk Busyness + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=ProcessMetrics TrackLatestType=Original | eval DiskBusyPercentage=(Elapsed-DiskIdleSeconds)/Elapsed | timechart $Span$ avg(DiskBusyPercentage) by Machine + 
$TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Max Run Loop Busyness by Role (for <=6.1, S2Pri1) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics NOT TrackLatestType=Rolled | eval Busyness=if(isnull(PriorityStarvedBelow1), if(isnull(PriorityBusy1), S2Pri1, PriorityBusy1/Elapsed), PriorityStarvedBelow1/Elapsed) | timechart $Span$ max(Busyness) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Max Run Loop Busyness by Priority (6.2+ only) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics TrackLatestType=Original | foreach PriorityBusy* [eval Busyness<<MATCHSTR>>=PriorityBusy<<MATCHSTR>>/Elapsed] | timechart $Span$ max(Busyness*) + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + TLog Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval QueueSize=SharedBytesInput-SharedBytesDurable | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Connection Timeouts (counted on both sides of connection) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) $Roles$ host=$Host$ | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) | rex field=WithAddr "(?<OtherAddr>[^:]*:[^:]*).*" | eval Machine=Machine+","+OtherAddr | makemv delim="," Machine | search Machine=$Machine$ | eval Count=1+SuppressedEventCount | timechart sum(Count) by Machine useother=f + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Pairwise Connection Timeouts Between Datacenters + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) host=* Machine=* NOT TrackLatestType=Rolled +| eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) +| rex field=host "(?<Datacenter>..).*" +| eval Datacenter=if(isnotnull(pie_work_unit), 
pie_work_unit, Datacenter) +| rex field=WithAddr "(?<OtherIP>[^:]*):.*" +| join OtherIP + [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled + | rex field=Machine "(?<OtherIP>[^:]*):.*" + | rex field=host "(?<OtherDatacenter>..).*" + | eval OtherDatacenter=if(isnotnull(pie_work_unit), pie_work_unit, OtherDatacenter)] +| eval DC1=if(Datacenter>OtherDatacenter, Datacenter, OtherDatacenter), DC2=if(Datacenter>OtherDatacenter, OtherDatacenter, Datacenter) +| eval Connection=DC1+" <-> " + DC2 +| eval Count=1+SuppressedEventCount +| timechart count by Connection + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Pairwise Connection Timeouts Between Known Server Processes (Sorted by Count, descending) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut OR Type=ProcessMetrics) $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr), Reason=if(Type=="ConnectionTimedOut", "Timed out trying to connect", "Established connection timed out") | rex field=Machine "(?<IP>[^:]*):.*" | rex field=host "(?<Datacenter>..).*" | rex field=WithAddr "(?<OtherIP>[^:]*):.*" | eventstats values(Roles) as Roles by IP | join OtherIP [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled | rex field=Machine "(?<OtherIP>[^:]*):.*" | rex field=host "(?<OtherDatacenter>..).*" | stats values(Roles) as OtherRoles by OtherIP, OtherDatacenter | eval OtherRoles="("+mvjoin(OtherRoles,",")+")"] | eval Roles="("+mvjoin(Roles,",")+")" | eval IP=Datacenter+": "+IP+" "+Roles, OtherIP=OtherDatacenter+": "+OtherIP+" "+OtherRoles | eval Addr1=if(IP>OtherIP, IP, OtherIP), Addr2=if(IP>OtherIP, OtherIP, IP) | eval Connection=Addr1+" <-> " + Addr2 | eval Count=1+SuppressedEventCount | stats sum(Count) as Count, values(Reason) as Reasons by Connection | sort -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + Lazy Deletion Rate (making space available for reuse) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=LazyDeletePages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time), as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Vacuuming Rate (shrinking file) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=VacuumedPages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time), as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Roles + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | makemv delim="," Roles | mvexpand Roles | timechart $Span$ distinct_count(Machine) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + + Slow Tasks (Sorted by Duration, Descending) + + index=$Index$ LogGroup=$LogGroup$ Type=SlowTask $Roles$ host=$Host$ Machine=$Machine$ | sort -Duration | table _time, Duration, Machine, TaskID, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+ + + Event Counts (Sorted by Severity and Count, Descending) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | stats count as Count by Type, Severity | sort -Severity, -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Errors + + index=$Index$ LogGroup=$LogGroup$ Severity=40 $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | table _time, Type, Machine, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+
+ + + + Recoveries (Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MasterRecoveryState TrackLatestType=Original (StatusCode=0 OR StatusCode=11) | eval RecoveryResetInterval=10 | sort _time | streamstats earliest(_time) as RecoveryStart, count as EventCount reset_after="(StatusCode=11)" | where StatusCode=11 | eval EventCount=if(EventCount==1, 2, EventCount), RecoveryStart=if(RecoveryStart==_time, _time-RecoveryDuration, RecoveryStart) | sort -_time | streamstats current=f global=f window=1 first(RecoveryStart) as NextRecoveryStart | eval RecoverySpan=NextRecoveryStart-_time, FailedRecoveries=EventCount-2, SuccessfulRecoveries=1 | eval AvailableSeconds=if(RecoverySpan<RecoveryResetInterval, RecoverySpan, 0) | sort _time | streamstats earliest(RecoveryStart) as RecoveryStart, sum(FailedRecoveries) as FailedRecoveryCount, sum(SuccessfulRecoveries) as SuccessfulRecoveryCount, sum(AvailableSeconds) as AvailableSeconds reset_after="(NOT RecoverySpan < RecoveryResetInterval)" | where NOT RecoverySpan < RecoveryResetInterval | eval Duration=_time-RecoveryStart, StartTime=strftime(RecoveryStart, "%F %X.%Q"), ShortLivedRecoveryCount=SuccessfulRecoveryCount-1 | table StartTime, Duration, FailedRecoveryCount, ShortLivedRecoveryCount, AvailableSeconds | sort -StartTime + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Process (Re)starts + + index=$Index$ LogGroup=$LogGroup$ Type=ProgramStart TrackLatestType=Original $Roles$ host=$Host$ Machine=$Machine$ | table _time, Machine | sort -_time + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Failure Detection (Machine Filter Only) + + index=$Index$ LogGroup=$LogGroup$ Type=FailureDetectionStatus System=$Machine$ | sort _time | eval Failed=if(Status=="Failed", 1, 0) | streamstats current=t global=f window=2 first(Failed) as PrevFailed by System | where PrevFailed=1 OR Failed=1 | eval Failed=PrevFailed + "," + Failed | makemv delim="," Failed | mvexpand Failed | timechart $Span$ max(Failed) by System + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + +
+ + + + Storage Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBStored=BytesStored/1e9, Overhead=KvstoreBytesUsed/BytesStored, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeSpacePercent) as FreeSpacePercent, latest(GBStored) as GBStored, latest(GBUsed) as GBUsed, latest(Overhead) as OverheadFactor, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + TLog Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics host=* Machine=* TrackLatestType=Original Roles=TL | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeDiskSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeDiskSpacePercent) as FreeDiskSpacePercent, latest(GBUsed) as GBUsed, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + Data Movement by Type (Log Scale, Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MovingData TrackLatestType=Original | timechart avg(Priority*) as * + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Storage Server Max Bytes Stored by Host + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval GBStored=BytesStored/1e9 | timechart max(GBStored) by host limit=100 + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Master Failed Clients + + index=$Index$ LogGroup=$LogGroup$ Type=WaitFailureClient +| stats count by FailedEndpoint + $TimeRange.earliest$ + $TimeRange.latest$ + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/performance_overview.xml b/contrib/observability_splunk_dashboard/performance_overview.xml new file mode 100644 index 0000000000..0719e2bbab --- /dev/null +++ b/contrib/observability_splunk_dashboard/performance_overview.xml @@ -0,0 +1,323 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 60s + +
+ + + Transaction Rate measured on Proxies + + Sum in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " TxnThrottled +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), TxnThrottledRate=mvindex(TxnThrottled, 0) +| timechart span=$ChartBinSizeToken$ sum(TxnRequestInRate) as StartedTxnBatchRate, sum(TxnRequestOutRate) as FinishedTxnBatchRate, sum(TxnStartInRate) as StartedTxnRate, sum(TxnStartOutRate) as FinishedTxnRate, sum(TxnThrottledRate) as ThrottledTxnRate + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Read Rate measured on Storage Servers + + Average in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" +| rex field=RowsQueried "(?<KRate>.*) (?<KRoughness>.*) (?<KCounter>.*)" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(RRate) as BytesReadPerSecond, avg(KRate) as RowsReadPerSecond, avg(FRate) as DDReadPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Proxies + + 1min Average + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " MutationBytes +| makemv delim=" " Mutations +| eval MutationBytesRate=mvindex(MutationBytes, 0), MutationsRate=mvindex(Mutations,0) +| bucket span=5s _time +| stats sum(MutationBytesRate) as MutationBytes, sum(MutationsRate) as 
Mutations by _time +|eval MutationMB=MutationBytes/1024/1024, MutationsK=Mutations/1000 +| timechart span=$ChartBinSizeToken$ avg(MutationMB) as MutationMB, avg(MutationsK) as MutationsK + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Storage Servers + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(WRate) as BytesPerSecond, avg(FRate) as DDBytesWrittenPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + GRV Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=GRVLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Commit Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=CommitLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Read Latency measured on all Storage Servers + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=ReadLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + RateKeeper: ReleasedTPS vs LimitTPS + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| 
table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + RateKeeper: Throttling Reason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + RateKeeper: Throttling Server + + Ratekeeper: Limit Reason: ReasonServerID (Most recent 10 records) + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate AND TrackLatestType="Original" +| streamstats count as numOfEvents +| where numOfEvents < 10 +| eval DateTime=strftime(Time, "%Y-%m-%dT%H:%M:%S") +| table DateTime, ReasonServerID + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+
+ + + Disk Overhead = Disk Usage / Logical KV Size + + Y-axis is capped at 10 + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type=StorageMetrics OR Type=DDTrackerStats) TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes, avg(TotalSizeBytes) as LogicalKVBytes by _time +| eval overhead=StorageDiskUsedBytes/LogicalKVBytes +| timechart avg(overhead) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + KV Data Size + + + index=$Index$ LogGroup=$LogGroup$ +Roles=*DD* host=* Machine=* Type=DDTrackerStats TrackLatestType=Original +| eval TotalKVGB=TotalSizeBytes/1024/1024/1024, SystemKVGB=SystemSizeBytes/1024/1024/1024 +|timechart avg(TotalKVGB), avg(SystemKVGB), avg(Shards) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Disk Usage + + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* Type=StorageMetrics TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes by _time +|eval StorageDiskTotalMB = StorageDiskTotalBytes/1024/1024, StorageDiskUsedMB=StorageDiskUsedBytes/1024/1024 +| timechart avg(StorageDiskTotalMB) as StorageDiskTotalMB, avg(StorageDiskUsedMB) as StorageDiskUsedMB + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Cluster Roles + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics TrackLatestType="Original" +| rex field=host "(?<HostDC>..).*-..(?<HostConfig>..).*" +| eval HostDC=if(isnotnull(pie_work_unit), pie_work_unit, HostDC) +| makemv delim="," Roles +| stats dc(Machine) as MachineCount by Roles, HostDC +| stats list(HostDC), list(MachineCount) by Roles +| sort Roles + $TimeSpan.earliest$ + $TimeSpan.latest$ + + +
+
+
+ + + Storage Engine + + + index=$Index$ LogGroup=$LogGroup$ Type=Role Origination=Recruited As=StorageServer | table StorageEngine, OriginalDateTime, DateTime |head 2 + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+ + Cluster Generations + + Indicate FDB recoveries + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics |timechart max(Generation) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + +
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/ratekeeper.xml b/contrib/observability_splunk_dashboard/ratekeeper.xml new file mode 100644 index 0000000000..c4a31a8fbc --- /dev/null +++ b/contrib/observability_splunk_dashboard/ratekeeper.xml @@ -0,0 +1,928 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 30s + + + + Yes + No + + + + + MasterServer + MasterProxyServer + StorageServer + TLog + Resolver + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + +
+ + + Aggregated Storage Server Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" + | rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" + | rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" + | rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" + | bin span=5s _time + | stats sum(RRate) as ReadSum, sum(WRate) as WriteSum, sum(FRate) as FetchedKeyRate by _time + | eval ReadSpeedMB=ReadSum/1024/1024, WriteSpeedMB=WriteSum/1024/1024, FetchedKeyRateMB=FetchedKeyRate/1024/1024 + |timechart avg(ReadSpeedMB), avg(WriteSpeedMB), avg(FetchedKeyRateMB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Aggregated Proxy Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " MutationBytes +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), MutationBytesRate=mvindex(MutationBytes, 0) +| bin span=60s _time +| stats avg(TxnRequestInRate) as TxnRequestInRatePerHost, avg(TxnRequestOutRate) as TxnRequestOutRatePerHost, avg(TxnStartInRate) as TxnStartInRatePerHost, avg(TxnStartOutRate) as TxnStartOutRatePerHost, avg(MutationBytesRate) as MutationBytesRatePerHost by Machine,_time +| eval WriteThroughputKB=sum(MutationBytesRatePerHost)/1000 +| timechart span=1m sum(TxnRequestInRatePerHost), sum(TxnRequestOutRatePerHost), sum(TxnStartInRatePerHost), sum(TxnStartOutRatePerHost), sum(WriteThroughputKB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 1: Overview - GRV Arrivals and Leaves per Second Seen by Proxies + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND 
TrackLatestType="Original" +| eval TxnRequestIn=mvindex(TxnRequestIn, 0), TxnRequestOut=mvindex(TxnRequestOut, 0), TxnStartIn=mvindex(TxnStartIn, 0), TxnStartOut=mvindex(TxnStartOut, 0) +| timechart span=30s avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) by Machine + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + Chart 2: RKOverview - Input ReleasedTPS and Output TPSLimit + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 3: RKOverview - RKLimitReason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + + Chart 4: Don't Process Transactions - RkSSListFetchTimeout (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkSSListFetchTimeout" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 5: Don't Process Transactions - RkTlogMinFreeSpaceZero (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkTlogMinFreeSpaceZero" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 6: Don't Process Transactions - ProxyGRVThresholdExceeded + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyGRVThresholdExceeded*") AND TrackLatestType="Original" +| timechart span=1s count by Type + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 7: RKLimitReasonCandidate - LimitingStorageServerDurabilityLag (MVCCVersionInMemory) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart 
span=$ChartBinSizeToken$ avg(LimitingStorageServerDurabilityLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 8: RKLimitReasonCandidate - LimitingStorageServerVersionLag (TLogVer-SSVer) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerVersionLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 9: RKLimitReasonCandidate - LimitingStorageServerQueue + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerQueue) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 10: Runtime Monitoring - StorageServer MVCCVersionInMemory (storage_server_durability_lag) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval NonDurableVersions=Version-DurableVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(NonDurableVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 11: Runtime Monitoring - StorageServer LocalRate (higher MVCCVersionInMemory -> lower LocalRate) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(LocalRate) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 12: Runtime Monitoring - StorageServer ReadsRejected (lower LocalRate -> higher probability of rejecting read)) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(ReadsRejected) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 13: Runtime Monitoring - Version Lag between StorageServer and Tlog (storage_server_readable_behind) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval 
SSFallBehindVersions=VersionLag +| timechart span=$ChartBinSizeToken$ limit=0 avg(SSFallBehindVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 14: Runtime Monitoring - StorageServerBytes (storage_server_write_queue_size) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput | makemv delim=" " BytesDurable | makemv delim=" " BytesFetched | makemv delim=" " MutationBytes +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesFetched=mvindex(BytesFetched, 2), MutationBytes=mvindex(MutationBytes, 2), BytesInMemoryQueue=BytesInput-BytesDurable +| timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 15: Runtime Monitoring - StorageServer KVStore Free Space Ratio (storage_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 16: Runtime Monitoring - TLog Queue Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval QueueBytesFreeRatio=QueueDiskBytesFree/QueueDiskBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(QueueBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 17: Runtime Monitoring - TLog KVStore Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ 
+ $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 18: Runtime Monitoring - TLogBytes (log_server_write_queue) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput +| makemv delim=" " BytesDurable +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesInMemoryQueue=BytesInput-BytesDurable | timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 19: Runtime Monitoring - Proxy Throughput + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ limit=0 avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) avg(TxnStartBatch) avg(TxnStartErrors) avg(TxnCommitIn) avg(TxnCommitVersionAssigned) avg(TxnCommitResolving) avg(TxnCommitResolved) avg(TxnCommitOut) avg(TxnCommitOutSuccess) avg(TxnCommitErrors) avg(TxnThrottled) avg(TxnConflicts) avg(CommitBatchIn) avg(CommitBatchOut) avg(TxnRejectedForQueuedTooLong) avg(Mutations) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 20: Runtime Monitoring - Proxy Queue Length + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" | timechart span=$ChartBinSizeToken$ limit=0 avg(*QueueSize*) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 21: Runtime Monitoring - TLog UnpoppedVersion + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval UnpoppedVersion=PersistentDataDurableVersion-QueuePoppedVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(UnpoppedVersion) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 22: Runtime Monitoring - Storage Server Disk (AIODiskStall) + + + 
index=$Index$ LogGroup=$LogGroup$ Type="ProcessMetrics" +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As="StorageServer" + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ limit=0 avg(AIODiskStall) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 23: Runtime Monitoring - StorageServer Query Queue Length + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv QueryQueue | eval QueryQueue=mvindex(QueryQueue, 1) | table _time QueryQueue Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(QueryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 24: Transaction Trace Stats - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.masterProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin +| bin bins=20 span=$StatsGRVSpanToken$ TimeSpan +| chart limit=0 count by 
TimeSpan $GRVByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 25: Transaction Trace Stats - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time ID TimeSpan Machine Location Time +| bin bins=20 span=$StatsReadSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $GetValueByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 26: Transaction Trace Stats - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Machine + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR 
*ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location) +| search Count>=2 +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=T1 +| table _time TimeSpan Machine +| bin bins=20 span=$StatsCommitSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $CommitByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 27: Transaction Tracing - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.*ProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| eval Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Order Type ID Location Machine Roles 
+| sort 0 Order Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), TimeInQueue = T2-T1, TimeGetVersionFromProxies = if(mvcount==4, T3-T2, -0.0000001), TimeConfirmLivenessFromTLogs = if(mvcount==4, T4-T3, T3-T2), TimeSpan=if(mvcount==4,T4-T1,T3-T1), _time=T1 +| table _time TimeSpan TimeInQueue TimeGetVersionFromProxies TimeConfirmLivenessFromTLogs Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeInQueue), avg(TimeGetVersionFromProxies), avg(TimeConfirmLivenessFromTLogs) $GRVLatencyByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 28: Transaction Tracing - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time TimeSpan +| timechart span=30s limit=0 avg(TimeSpan) $GetValueLatencyByMachineToken$ + 
$TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 29: Transaction Tracing - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location) +| search Count=7 +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), T5=mvindex(Time, 4), T6=mvindex(Time, 5), T7=mvindex(Time, 6), TimeSpan=T7-T1, 
TimeResolution=T4-T3, TimePostResolution=T5-T4, TimeProcessingMutation=T6-T5, TimeTLogPush=T7-T6, _time=T1 +| table _time TimeSpan TimeResolution TimePostResolution TimeProcessingMutation TimeTLogPush Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeResolution), avg(TimePostResolution), avg(TimeProcessingMutation), avg(TimeTLogPush) $CommitByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 30: Transaction Tracing - Commit - TLogPush and Resolver Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Step + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (Resolver.resolveBatch.Before OR Resolver.resolveBatch.AfterQueueSizeCheck OR Resolver.resolveBatch.AfterOrderer OR Resolver.resolveBatch.After OR TLog.tLogCommit.BeforeWaitForVersion OR TLog.tLogCommit.Before OR TLog.tLogCommit.AfterTLogCommit OR TLog.tLogCommit.After) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location=="MasterProxyServer.batcher", 1, Location=="MasterProxyServer.commitBatch.Before", 2, Location=="MasterProxyServer.commitBatch.GettingCommitVersion", 3, Location=="MasterProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location=="MasterProxyServer.commitBatch.AfterResolution", 8.5, Location=="MasterProxyServer.commitBatch.ProcessingMutations", 9, Location=="MasterProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location=="MasterProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 
16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location), Step=case(Count=4 and (mvindex(Location, 0) like "TLog%"), "TimeTLogCommit", Count=4 and (mvindex(Location, 0) like "Resolver%"), "TimeResolver", Count=10, "TimeSpan"), BeginTime=mvindex(Time, 0), EndTime=mvindex(Time, -1), Duration=EndTime-BeginTime, _time=BeginTime +| search Count=4 +| eval Machinei=mvindex(Machine, 0), MachineStep = Step."-".Machinei +| table _time Step Duration Machinei Location Machine MachineStep +| timechart span=$ChartBinSizeToken$ limit=0 avg(Duration) by $TLogResolverByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 31: Machine Performance - CPU Utilization (CPU Time divided by Elapsed) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory Elapsed +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization=CPUSeconds/Elapsed +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 32: Machine Performance - Memory Utilization (ResidentMemory divided by Memory) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ 
LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization = ResidentMemory/Memory +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 33: Machine Performance - Disk Utilization ((DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization = (DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 34: Machine Performance - Network (Mbps Received and Mbps Sent) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(MbpsReceived) avg(MbpsSent) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 35: Machine Performance - Disk (Reads Count and Writes Count) + + + index=$Index$ LogGroup=$LogGroup$ 
Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(DiskReadsCount) avg(DiskWritesCount) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 36: Network Performance - Timeout + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ count $TimeoutByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 37: Network Performance - PingLatency + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=PingLatency) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ avg(MeanLatency) avg(MaxLatency) $PingLatencyByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + +
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/recovery.xml b/contrib/observability_splunk_dashboard/recovery.xml new file mode 100644 index 0000000000..6ba6b9a63b --- /dev/null +++ b/contrib/observability_splunk_dashboard/recovery.xml @@ -0,0 +1,873 @@ +
+ +
+ + + Table 1: Find long recovery (Input Index and LogGroup and Select a time span). + + + * + + + + + + + + + -0s + now + + + + + index=$IndexForOverview$ LogGroup=$LogGroupForOverview$ + ((Type="MasterRecoveryState" AND (Status="reading_coordinated_state" OR Status="fully_recovered" OR Status="accepting_commits")) OR (Type="Role" AND As="MasterServer" AND ("Transition"="Begin" OR "Transition"="End")) OR Type="MasterTerminated") AND (NOT TrackLatestType="Rolled") | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Machine Type Transition As Status DateTime Time ErrorDescription LogGroup +| search NOT ErrorDescription="Success" +| eval EventType=case(Transition="Begin" AND As="MasterServer" AND Type="Role", "MasterStart", Type="MasterRecoveryState" AND Status="fully_recovered", "FullRecovery", Type="MasterRecoveryState" AND Status="reading_coordinated_state", "StartRecoveryAttempt", Transition="End" AND As="MasterServer" AND Type="Role", "MasterTerminated", Type="MasterTerminated", "MasterTerminated", Type="MasterRecoveryState" AND Status="accepting_commits", "AcceptingCommits") +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| fillnull value="-" +| sort -Time +| eval ifMasterTerminatedEvent=if(EventType="MasterTerminated", 1, 0) +| stats list(*) by ID Machine ifMasterTerminatedEvent +| rename list(*) as * +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| sort -Time +| eval LastTime=mvindex(Time, 0), FirstTime=mvindex(Time, -1), Duration=LastTime-FirstTime +| table ID Machine Duration EventType DateTime Time ErrorDescription LogGroup + $time_token_for_recoveryhistorytable.earliest$ + $time_token_for_recoveryhistorytable.latest$ + + + + +
+
+
+ + + Table 2: Select timespan containing the long recovery and see all recovery attempts in the time span (The input Index and LogGroup and Timespan are for all following tables and charts) + + + * + + + + + + + + -0s@s + now + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="MasterRecoveryState" OR (Type="MasterTerminated") OR (Type="Role" AND As="MasterServer" AND "Transition"="End") OR Type="RecoveryInternal" OR Type="ProxyReplies" OR Type="CommitProxyReplies" OR Type="ResolverReplies" OR Type="MasterRecruitedInitialStorageServers") AND (NOT TrackLatestType="Rolled") +| rename ID as MasterID +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table MasterID Machine Status Step Type DateTime Time StatusCode MyRecoveryCount ErrorDescription Reason ErrorCode +| fillnull value="-" ErrorDescription Reason ErrorCode +| eval Status=case(Type=="MasterRecoveryState", Status, Type=="Role", "RoleEnd", Type=="MasterTerminated", "MasterTerminated", Type=="RecoveryInternal", Status."/".Step, Type=="ProxyReplies" OR Type=="CommitProxyReplies", "initializing_transaction_servers/ProxyReplies", Type="ResolverReplies", "initializing_transaction_servers/ResolverReplies", Type=="MasterRecruitedInitialStorageServers", "initializing_transaction_servers/MasterRecruitedInitialStorageServers"), StatusCode=case(Type=="ProxyReplies" OR Type=="CommitProxyReplies" OR Type=="ResolverReplies" OR Type=="MasterRecruitedInitialStorageServers", "8", Type!="ProxyReplies" AND Type!="CommitProxyReplies" AND Type!="ResolverReplies" AND Type!="MasterRecruitedInitialStorageServers", StatusCode) +| fillnull value="-" StatusCode +| sort 0 -Time -StatusCode +| stats list(*) by MasterID Machine +| rename list(*) as * +| eval FirstTime=mvindex(Time, -1), LastTime=mvindex(Time, 0), Duration=LastTime-FirstTime +| table MasterID Machine MyRecoveryCount Duration ErrorDescription Reason ErrorCode StatusCode Status DateTime Time +| sort -MyRecoveryCount +| fillnull value="-" MyRecoveryCount + 
$ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 3: Why recovery is triggered? Using WaitFailureClient event. Machine A detects Machine B's failure. First column is the time when WaitFailureClient happens. Columns of 2,3,4,5 are for A. Columns of 6,7 are for B. + + + index=$Index$ LogGroup=$LogGroup$ + Type="WaitFailureClient" +| table Type Time Machine FailedEndpoint +| replace *:tls with * in FailedEndpoint +| join Machine type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND Transition="End" + | eval EndTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | rename As as Role + | table ID EndTime Machine Role] +| join FailedEndpoint type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" + | stats latest(*) by ID | rename latest(*) as * + | rename Machine as FailedEndpoint + | eval FailedEndpointLatestRoleEventInfo=As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(*) by FailedEndpoint + | rename list(*) as * + | table FailedEndpoint FailedEndpointLatestRoleEventInfo] +| eval FailureDetectedTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| makemv delim=" " FailedEndpointLatestRoleEventInfo +| table FailureDetectedTime Machine ID Role EndTime FailedEndpoint FailedEndpointLatestRoleEventInfo + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 4: New Recruitment Configuration (using MasterRecoveredConfig event) + + + index=$Index$ LogGroup=$LogGroup$ + Type="MasterRecoveredConfig" AND TrackLatestType="Original" +| eval Configuration=replace(Conf, "&quot;", "\"") +| rename Configuration as _raw + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + Table 5: Data Centers (using ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics +| dedup DCID +| rename DCID as DataCenterID +| table DataCenterID pie_work_unit +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 6: New Role (using Role event joined by ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="ClusterController") OR (As="MasterServer") OR (As="TLog") OR (As="Resolver") OR (As="MasterProxyServer") OR (As="CommitProxyServer") OR (As="GrvProxyServer") OR (As="LogRouter")) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search count=1 AND Transition="Begin" +| table ID Role Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter +| fillnull value="null" DataCenter +| stats count by Role DataCenter + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 7: Role Details + + + MasterServer + TLog + Resolver + MasterProxyServer (for <7.0) + LogRouter + CommitProxyServer (for 7.0+) + GrvProxyServer (for 7.0+) + As=" + " + OR + + + + Begin + End + Begin->End + count=1 AND Transition="Begin" + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($RolesToken$) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search $RoleDetailTableWhichRoleToken$ +| table ID Role Machine Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter Time +| fillnull value="null" DataCenter +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Role Machine DataCenter DateTime +| sort 0 -DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 8: CC Recruitment SevWarn OR SevError (use events in clusterRecruitFromConfiguration and clusterRecruitRemoteFromConfiguration) + + + index=$Index$ LogGroup=$LogGroup$ + Type="RecruitFromConfigurationNotAvailable" OR Type="RecruitFromConfigurationRetry" OR Type="RecruitFromConfigurationError" OR Type="RecruitRemoteFromConfigurationNotAvailable" OR Type="RecruitRemoteFromConfigurationRetry" OR Type="RecruitRemoteFromConfigurationError" + | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)"), GoodRecruitmentTimeReady=case(Type=="RecruitFromConfigurationNotAvailable" OR Type=="RecruitRemoteFromConfigurationNotAvailable", "True", Type=="RecruitFromConfigurationRetry" OR Type=="RecruitRemoteFromConfigurationRetry", GoodRecruitmentTimeReady, Type=="RecruitFromConfigurationError" OR Type=="RecruitRemoteFromConfigurationError", "-") + | table Type GoodRecruitmentTimeReady Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + +
+
+
+ + + Table 9: RecoveryCount of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogStart") OR (LogId=$row.TLogID$ AND Type="TLogPersistentStateRestore") +| eval ID=if(Type="TLogStart", ID, LogId), DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID RecoveryCount Type DateTime | fillnull value="Not found. The fdb version is somewhat old." + -7d@h + now + + + +
+
+ + Table 10: Which roles the selected TLog (in Table 11) talks to + + + index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| table TLogID TLogEvents +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), MasterID=mvindex(temp,2) +| fields - temp - TLogEvents +| sort 0 -Time +| search NOT MasterID="NULL" +| dedup MasterID +| rename MasterID as ID +| join type=left ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role") + | sort 0 -Time + | dedup ID + | table ID Machine As] +| table ID Machine As | fillnull value="null" Machine As + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 11: TLog Events (Collecting all TLogs that produce interesting events during the time span) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2")) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled") AND $SeeLogEventDetailTableToken$ +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."null", (Type="TLogReady"), Time." ".Type." "."null", (Type="TLogStart"), Time." ".Type." "."null", (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." 
"."null") +| stats list(TLogEvents) by TLogID +| rename list(TLogEvents) As TLogEvents +| eval EarliestEvent=mvindex(TLogEvents, -1) , LatestEvent=mvindex(TLogEvents, 0) +| table TLogID TLogEvents EarliestEvent LatestEvent +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND As="TLog") + | sort 0 -Time + | dedup ID + | rename ID as TLogID + | table TLogID host LogGroup Machine] +| table TLogID Machine LogGroup host EarliestEvent LatestEvent +| fillnull value="null" Machine host LogGroup +| eval temp=split(LatestEvent," "), LatestTime=mvindex(temp,0), LatestEvent=mvindex(temp,1), temp2=split(EarliestEvent," "), EarliestTime=mvindex(temp2,0), EarliestEvent=mvindex(temp2,1), Duration=LatestTime-EarliestTime +| table TLogID Machine EarliestTime Duration LogGroup host +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="null" DataCenter +| table TLogID Machine DataCenter EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining") OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow")) OR ((Type="TLogLockStarted" OR Type="TLogLocked")) OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh")) AND (NOT TrackLatestType="Rolled") + | sort -Time + | eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." 
".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null") + | stats list(*) by TLogID + | rename list(*) As * + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | sort TLogID + | table TLogID TLogEvents + | mvexpand TLogEvents + | eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), RoleID=mvindex(temp,2) + | fields - temp - TLogEvents + | sort 0 -Time + | search NOT RoleID="NULL" + | table TLogID RoleID MasterMachine + | stats list(*) by TLogID + | rename list(*) as * + | streamstats count + | mvexpand RoleID + | dedup count RoleID + | fields - count + | stats count by TLogID + | rename count as Roles + | table TLogID Roles] +| table TLogID Machine DataCenter Roles EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled")) + | sort -Time + | eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=if(Type="Role", Type.Transition, Type) + | sort 0 TLogEvents + | stats list(TLogEvents) by TLogID + | rename list(TLogEvents) As TLogEvents + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND 
like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | mvcombine delim=" " TLogEvents + | table TLogID TLogEvents] +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestTime host LogGroup +| eval EarliestDateTime=strftime(EarliestTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogStart") OR (Type="TLogPersistentStateRestore") + | eval TLogID=if(Type="TLogStart", ID, LogId) + | table TLogID RecoveryCount] +| table TLogID RecoveryCount Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| fillnull value="TLog too old, click and see details" RecoveryCount + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + $click.value$ + +
+
+ + Table 12: Event Details (Including rejoining events) of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover" AND LogId=$row.TLogID$) OR (Type="TLogReady" AND ID=$row.TLogID$) OR (Type="TLogStart" AND ID=$row.TLogID$) OR + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."-"." "."-", (Type="TLogReady"), Time." ".Type." "."-"." "."-", (Type="TLogStart"), Time." ".Type." "."-"." "."-", (Type="TLogRejoining"), Time." ".Type." ".Master." "."-", (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."-", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."-"." "."-", (Type="Role" AND As="TLog" AND Transition="Begin" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." ".Origination, (Type="Role" AND As="TLog" AND Transition="End" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." 
"."-") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role" AND As="TLog" AND ID=$row.TLogID$) + | dedup ID + | rename ID as TLogID + | table TLogID Machine] +| table TLogID Machine TLogEvents +| fillnull value="-" Machine +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), ToID=mvindex(temp,2), Origination= mvindex(temp,3) +| fields - temp - TLogEvents +| join type=left + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role") + | dedup ID + | rename ID as ToID + | rename As as ToRole + | rename Machine as ToMachine + | table ToID ToRole ToMachine] +| sort 0 -Time +| fillnull value="-" ToRole ToMachine +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine Event DateTime ToID ToRole ToMachine Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 13: All Tags of the selected TLog (in Table 11) that have been popped by SSes (using TLogPoppedTag event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| rename Tags as UnpoppedRecoveredTagCount +| rename Tag as TagPopped +| rename DurableKCVer as DurableKnownCommittedVersion +| search TagPopped!="-1:2" +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt +| sort 0 -UnpoppedRecoveredTagCount +| join TagPopped type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | rename Tag as TagPopped + | table TagPopped ID Machine] +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| rename ID as SSID +| rename Machine as SSMachine +| rename DataCenter as SSDataCenter +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped SSID SSMachine SSDataCenter DurableKnownCommittedVersion RecoveredAt +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+ + Table 14: All Tags of the selected TLog (in Table 11) to be popped by SSes (using TLogReady event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| table TLogID Type AllTags Locality +| makemv delim="," AllTags +| mvexpand AllTags +| rename AllTags as Tag | sort 0 Tag +| join Tag type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | table Tag ID Machine] +| table TLogID Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="-" +| table TLogID Tag ID Machine DataCenter +| rename ID as SSID | rename Machine as SSMachine | rename DataCenter as SSDataCenter +| search Tag!="-1:2" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 15: The Tags of the selected TLog (in Table 11) that are not popped by SSes (using set diff tags in Table 13 and Table 14) (if result contains "...", the result of Table 15 is wrong) + + + | set diff + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") + | table AllTags + | makemv delim="," AllTags + | mvexpand AllTags + | rename AllTags as Tag + | table Tag] + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") + | table Tag] + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 16: All Current Storage Servers (assume each machine has at most one SS) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") AND $TriggerSSTableToken$ +| stats latest(*) by Machine +| rename latest(*) as * +| table Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Machine DataCenter Tag +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="StorageServer")) AND (NOT TrackLatestType="Rolled")) + | stats latest(*) by Machine + | rename latest(*) as * + | rename As as Role + | table ID Role Machine + | join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] + | table ID Role Machine DataCenter + | fillnull value="null" DataCenter] +| sort 0 DataCenter +| table Tag ID Machine DataCenter | sort 0 Tag + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 1: Timeout/TimedOut event distribution grouped by source (Machine) + + + 5s + + + + TLog + MasterServer + MasterProxyServer (for version < 7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for ver 7+) + As=" + " + OR + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by Machine + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Chart 2: Timeout/TimedOut event distribution grouped by destination (PeerAddr) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by PeerAddr + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 17: Check Type=ConnectionTimedOut OR Type=ConnectionTimeout events between transaction roles in the recovery (including the role that refresh/begin/end in 
the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| stats count as TotalTimeouts by Machine PeerAddr +| table Machine PeerAddr TotalTimeouts +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS MachineRoleLatestEvent BY Machine + ] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS PeerRoleLatestEvent BY Machine + | rename Machine AS PeerAddr + ] +| table Machine PeerAddr TotalTimeouts MachineRoleLatestEvent PeerRoleLatestEvent + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 18: Proxy 0 + + + index=$Index$ LogGroup=$LogGroup$ + (Type="ProxyReplies" OR Type="CommitProxyReplies") AND FirstProxy="True" +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table WorkerID LogGroup FirstProxy Time DateTime +| sort 0 -Time +| join type=left WorkerID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND As="Worker" AND Transition="Refresh" + | dedup ID + | rename ID as WorkerID + | stats list(*) by WorkerID + | rename list(*) as * + | table WorkerID Machine Roles] +| table WorkerID Machine Roles LogGroup FirstProxy Time DateTime +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND (As="MasterProxyServer" OR As="CommitProxyServer") AND Transition="Refresh" + | dedup ID + | rename ID as ProxyID + | table Machine ProxyID] +| table ProxyID Machine LogGroup FirstProxy + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 19: Latest Role Events on the input Machine (Input Machine, like 172.27.113.121:4500) + + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND Machine=$SearchMachineToken$ +| stats latest(*) by ID Transition +| rename latest(*) as * +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table DateTime Machine ID Transition As Roles LogGroup Error ErrorDescription Reason +| sort 0 -DateTime +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 3: severity>=20 event distribution (including roles that refresh/begin/end in the timespan) + + + * + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + EventType + Machine + Severity + Type + + + + 5s + + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 AND $BadEvents$ +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID | table Machine] +| table Machine Type Severity _time +| timechart useother=0 span=$BadEventChartTimeSpanToken$ count by $BadEventChartBy$ + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 20: Check severity>20 events of roles in the recovery (including the role that refresh/begin/end in the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 +| stats count by Machine Type +| rename count as Count +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID + | eval Role=As."-".ID + | stats list(Role) by Machine + | rename list(Role) as Roles + | table Machine Roles] +| table Type Count Roles Machine +| sort -Count + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/transaction_latency.xml b/contrib/observability_splunk_dashboard/transaction_latency.xml new file mode 100644 index 0000000000..99b551f2c9 --- /dev/null +++ b/contrib/observability_splunk_dashboard/transaction_latency.xml @@ -0,0 +1,247 @@ +
+ + Design for ClusterController-issued transactions.
+ + + + + + + * + + + + * + + + + + @d + now + + +
+ + + All Transactions (Currently, this table also does not cover getrange operation and the operation which not do commit). + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ ID=$transactionID$ + (Type="TransactionAttachID" OR Type="GetValueAttachID" OR Type="CommitAttachID") +| eval To=case(Type=="TransactionAttachID", "0"."-".To, Type="GetValueAttachID", "1"."-".To, Type=="CommitAttachID", "2"."-".To) +| stats list(To) by ID +| rename list(To) as ToList +| table ID ToList +| eval Count = mvcount(ToList) +| search Count=3 +| eval To0=mvindex(ToList,0), To1=mvindex(ToList,1), To2=mvindex(ToList,2), To0=split(To0,"-"), To1=split(To1,"-"), To2=split(To2,"-"), GrvID=case(mvindex(To0, 0)=="0", mvindex(To0, 1), mvindex(To1, 0)=="0", mvindex(To1, 1), mvindex(To2, 0)=="0", mvindex(To2, 1)), ReadID=case(mvindex(To0, 0)=="1", mvindex(To0, 1), mvindex(To1, 0)=="1", mvindex(To1, 1), mvindex(To2, 0)=="1", mvindex(To2, 1)), CommitID=case(mvindex(To0, 0)=="2", mvindex(To0, 1), mvindex(To1, 0)=="2", mvindex(To1, 1), mvindex(To2, 0)=="2", mvindex(To2, 1)) +| table ID GrvID ReadID CommitID +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.Before") + | rename ID as GrvID + | rename Time as BeginTime + | table GrvID BeginTime + ] +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.After") + | rename ID as GrvID + | rename Time as GRVDoneTime + | table GrvID GRVDoneTime + ] +| join ReadID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="GetValueDebug" AND Location="NativeAPI.getValue.After") + | rename ID as ReadID + | rename Time as ReadDoneTime + | table ReadID ReadDoneTime + ] +| join CommitID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="CommitDebug" AND Location="NativeAPI.commit.After") + | rename ID as CommitID + | rename Time as CommitDoneTime + | table CommitID CommitDoneTime + ] +| 
rename ID as TransactionID +| eval BeginToGRVDone = GRVDoneTime-BeginTime, GRVDoneToReadDone = ReadDoneTime-GRVDoneTime, ReadDoneToCommitDone = CommitDoneTime-ReadDoneTime, Duration=CommitDoneTime-BeginTime, BeginTimeScope=BeginTime-1, EndTimeScope=CommitDoneTime+1, BeginDateTime=strftime(BeginTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TransactionID Duration BeginDateTime BeginToGRVDone GRVDoneToReadDone ReadDoneToCommitDone Duration GrvID ReadID CommitID BeginTimeScope EndTimeScope | sort -Duration + $time_token.earliest$ + $time_token.latest$ + + + + $row.BeginTimeScope$ + $row.EndTimeScope$ + $row.ReadID$ + $row.GrvID$ + $row.CommitID$ + +
+
+
+ + + Step1: GRV + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (NOT MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion) +AND (ID=$GrvID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To]) +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime, Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location=="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location=="GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster", 2.1, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location=="MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion", 4, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Delta Order Type ID Location Machine Roles +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+ + Step1: (Only for FDB v6.3): GRV --- Get Committed Version (MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion Events) + + only for FDB 6.3 + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND Location="MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion" + AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To] +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime +| sort 0 -Time +| table Machine Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step2: GetValue + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ Type="GetValueDebug" AND ID=$ReadID$ +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (ID=$CommitID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To]) + +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLogServer.tLogCommit.BeforeWaitForVersion", 11, Location=="TLogServer.tLogCommit.Before", 12, Location=="TLogServer.tLogCommit.AfterTLogCommit", 13, Location=="TLogServer.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step3: Commit --- Resolver + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="Resolver*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| eval Order=case(Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8) +| sort 0 Time Order +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table Machine DataCenter Roles Duration Location Delta Time + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit --- Commit to TLogs (CommitDebug Events), grouped by Machine and sorted by Duration + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="TLog*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| sort 0 Time +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time + $BeginTime$ + $EndTime$ + + + +
+
+
+
\ No newline at end of file diff --git a/contrib/pkg_tester/test_fdb_pkgs.py b/contrib/pkg_tester/test_fdb_pkgs.py index 08ccd35aa6..178f84d93c 100644 --- a/contrib/pkg_tester/test_fdb_pkgs.py +++ b/contrib/pkg_tester/test_fdb_pkgs.py @@ -165,7 +165,6 @@ def centos_image_with_fdb_helper(versioned: bool) -> Iterator[Optional[Image]]: container = Container("centos:7", initd=True) for rpm in rpms: container.copy_to(rpm, "/opt") - container.run(["bash", "-c", "yum update -y"]) container.run( ["bash", "-c", "yum install -y prelink"] ) # this is for testing libfdb_c execstack permissions @@ -327,7 +326,7 @@ def test_execstack_permissions_libfdb_c(linux_container: Container, snapshot): [ "bash", "-c", - "execstack -q $(ldconfig -p | grep libfdb_c | awk '{print $(NF)}')", + "execstack -q $(ldconfig -p | grep libfdb_c.so | awk '{print $(NF)}')", ] ) diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py index d0df0708aa..79534596b5 100644 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py @@ -284,6 +284,12 @@ class ErrorCommitInfo(BaseInfo): if protocol_version >= PROTOCOL_VERSION_6_3: self.report_conflicting_keys = bb.get_bool() + if protocol_version >= PROTOCOL_VERSION_7_1: + lock_aware = bb.get_bool() + if bb.get_bool(): + spanId = bb.get_bytes(16) + + class UnsupportedProtocolVersionError(Exception): def __init__(self, protocol_version): super().__init__("Unsupported protocol version 0x%0.2X" % protocol_version) diff --git a/contrib/tsan.suppressions b/contrib/tsan.suppressions new file mode 100644 index 0000000000..2078f7e8c6 --- /dev/null +++ b/contrib/tsan.suppressions @@ -0,0 +1,5 @@ +# ThreadSanitizer suppressions file for FDB +# https://github.com/google/sanitizers/wiki/ThreadSanitizerSuppressions + +# FDB signal handler is not async-signal safe 
+signal:crashHandler diff --git a/design/data-distributor-internals.md b/design/data-distributor-internals.md index a935bbced6..ccaba537b6 100644 --- a/design/data-distributor-internals.md +++ b/design/data-distributor-internals.md @@ -20,7 +20,7 @@ Data distribution manages the lifetime of storage servers, decides which storage **RelocateShard (`struct RelocateShard`)**: A `RelocateShard` records the key range that need to be moved among servers and the data movement’s priority. DD always move shards with higher priorities first. -**Data distribution queue (`struct DDQueueData`)**: It receives shards to be relocated (i.e., RelocateShards), decides which shard should be moved to which server team, prioritizes the data movement based on relocate shard’s priority, and controls the progress of data movement based on servers’ workload. +**Data distribution queue (`struct DDQueue`)**: It receives shards to be relocated (i.e., RelocateShards), decides which shard should be moved to which server team, prioritizes the data movement based on relocate shard’s priority, and controls the progress of data movement based on servers’ workload. **Special keys in the system keyspace**: DD saves its state in the system keyspace to recover from failure and to ensure every process (e.g., commit proxies, tLogs and storage servers) has a consistent view of which storage server is responsible for which key range. @@ -153,3 +153,25 @@ CPU utilization. This metric is in a positive relationship with “FinishedQueri * The typical movement size under a read-skew scenario is 100M ~ 600M under default KNOB value `READ_REBALANCE_MAX_SHARD_FRAC=0.2, READ_REBALANCE_SRC_PARALLELISM = 20`. Increasing those knobs may accelerate the converge speed with the risk of data movement churn, which overwhelms the destination and over-cold the source. * The upper bound of `READ_REBALANCE_MAX_SHARD_FRAC` is 0.5. Any value larger than 0.5 can result in hot server switching. 
* When needing a deeper diagnosis of the read aware DD, `BgDDMountainChopper_New`, and `BgDDValleyFiller_New` trace events are where to go. + +## Data Distribution Diagnosis Q&A +* Why hasn't Read-aware DD been triggered when there's a read imbalance? + * Check the `SkipReason` field of the `BgDDMountainChopper_New` and `BgDDValleyFiller_New` events. +* The Read-aware DD is triggered, and some data movement happened, but it doesn't help the read balance. Why? + * Need to figure out which server is selected as the source and destination. The information is in the `DestTeam` and `SourceTeam` fields of the `BgDDMountainChopper*` and `BgDDValleyFiller*` events. + * Also, the `DDQueueServerCounter` event tells how many times a server has been a source or destination (defined in + ```c++ + enum CountType : uint8_t { ProposedSource = 0, QueuedSource, LaunchedSource, LaunchedDest }; + ``` + ) for different relocation reasons (`Other`, `RebalanceDisk` and so on) in different phases within `DD_QUEUE_COUNTER_REFRESH_INTERVAL` (default 60) seconds. For example, + ```xml + + ``` + `RebalanceReadPQSD="2 0 0 5"` means server `0000000000000004` has been proposed as a source for read balancing twice, but those relocations have not been queued or executed yet. This server has also been a destination for read balancing 5 times in the past 1 min. Note that the field will be skipped if all 4 numbers are 0. To avoid spammy traces, if the knob `DD_QUEUE_COUNTER_SUMMARIZE = true` is set, the event `DDQueueServerCounterTooMany` will summarize the unreported servers that are involved in launched relocations (i.e. their `LaunchedSource` or `LaunchedDest` counts are non-zero): + ```xml + + ``` +* How to track the lifecycle of a relocation attempt for balancing? + * First find the `TraceId` fields in `BgDDMountainChopper*` and `BgDDValleyFiller*`, which indicate that a relocation is triggered. + * (Only when enabled) Find the `QueuedRelocation` event with the same `BeginPair` and `EndPair` as the original `TraceId`. This means the relocation request is queued.
+ * Find the `RelocateShard` event whose `BeginPair`, `EndPair` field is the same as `TraceId`. This event means the relocation is ongoing. diff --git a/design/dynamic-knobs.md b/design/dynamic-knobs.md new file mode 100644 index 0000000000..00fe39e725 --- /dev/null +++ b/design/dynamic-knobs.md @@ -0,0 +1,420 @@ +# Dynamic Knobs + +This document is largely adapted from original design documents by Markus +Pilman and Trevor Clinkenbeard. + +## Background + +FoundationDB parameters control the behavior of the database, including whether +certain features are available and the value of internal constants. Parameters +will be referred to as knobs for the remainder of this document. Currently, +these knobs are configured through arguments passed to `fdbserver` processes, +often controlled by `fdbmonitor`. This has a number of problems: + +1. Updating knobs involves updating `foundationdb.conf` files on each host in a + cluster. This has a lot of overhead and typically requires external tooling + for large scale changes. +2. All knob changes require a process restart. +3. We can't easily track the history of knob changes. + +## Overview + +The dynamic knobs project creates a strictly serializable quorum-based +configuration database stored on the coordinators. Each `fdbserver` process +specifies a configuration path and applies knob overrides from the +configuration database for its specified classes. + +### Caveats + +The configuration database explicitly does not support the following: + +1. A high load. The update rate, while not specified, should be relatively low. +2. A large amount of data. The database is meant to be relatively small (under + one megabyte). Data is not sharded and every coordinator stores a complete + copy. +3. Concurrent writes. At most one write can succeed at a time, and clients must + retry their failed writes. 
+ +## Design + +### Configuration Path + +Each `fdbserver` process can now include a `--config_path` argument specifying +its configuration path. A configuration path is a hierarchical list of +configuration classes specifying which knob overrides the `fdbserver` process +should apply from the configuration database. For example: + +```bash +$ fdbserver --config_path classA/classB/classC ... +``` + +Knob overrides follow descending priority: + +1. Manually specified command line knobs. +2. Individual configuration class overrides. + * Subdirectories override parent directories. For example, if the + configuration path is `az-1/storage/gp3`, the `gp3` configuration takes + priority over the `storage` configuration, which takes priority over the + `az-1` configuration. +3. Global configuration knobs. +4. Default knob values. + +#### Example + +For example, imagine an `fdbserver` process run as follows: + +```bash +$ fdbserver --datadir /mnt/fdb/storage/4500 --logdir /var/log/foundationdb --public_address auto:4500 --config_path az-1/storage/gp3 --knob_disable_asserts false +``` + +And the configuration database contains: + +| ConfigClass | KnobName | KnobValue | +|-------------|---------------------|-----------| +| az-2 | page_cache_4k | 8e9 | +| storage | min_trace_severity | 20 | +| az-1 | compaction_interval | 280 | +| storage | compaction_interval | 350 | +| az-1 | disable_asserts | true | +| \ | max_metric_size | 5000 | +| gp3 | max_metric_size | 1000 | + +The final configuration for the process will be: + +| KnobName | KnobValue | Explanation | +|---------------------|-------------|-------------| +| page_cache_4k | \ | The configuration database knob override for `az-2` is ignored, so the compiled default is used | +| min_trace_severity | 20 | Because the `storage` configuration class is part of the process’s configuration path, the corresponding knob override is applied from the configuration database | +| compaction_interval | 350 | The `storage` knob 
override takes precedence over the `az-1` knob override | +| disable_asserts | false | This knob is manually overridden, so all other overrides are ignored | +| max_metric_size | 1000 | Knob overrides for specific configuration classes take precedence over global knob overrides, so the global override is ignored | + +### Clients + +Clients can write to the configuration database using transactions. +Configuration database transactions are differentiated from regular +transactions through specification of the `USE_CONFIG_DATABASE` database +option. + +In configuration transactions, the client uses the tuple layer to interact with +the configuration database. Keys are tuples of size two, where the first item +is the configuration class being written, and the second item is the knob name. +The value should be specified as a string. It will be converted to the +appropriate type based on the declared type of the knob being set. + +Below is a sample Python script to write to the configuration database. + +```python +import fdb + +fdb.api_version(720) + +@fdb.transactional +def set_knob(tr, knob_name, knob_value, config_class, description): + tr['\xff\xff/description'] = description + tr[fdb.tuple.pack((config_class, knob_name,))] = knob_value + +# This function performs two knob changes transactionally. +@fdb.transactional +def set_multiple_knobs(tr): + tr['\xff\xff/description'] = 'description' + tr[fdb.tuple.pack((None, 'min_trace_severity',))] = '10' + tr[fdb.tuple.pack(('az-1', 'min_trace_severity',))] = '20' + +db = fdb.open() +db.options.set_use_config_database() + +set_knob(db, 'min_trace_severity', '10', None, 'description') +set_knob(db, 'min_trace_severity', '20', 'az-1', 'description') +``` + +### Disable the Configuration Database + +The configuration database includes both client and server changes and is +enabled by default. Thus, to disable the configuration database, changes must +be made to both. 
+ +#### Server + +The configuration database can be disabled by specifying the ``fdbserver`` +command line option ``--no-config-db``. Note that this option must be specified +for *every* ``fdbserver`` process. + +#### Client + +The only client change from the configuration database is as part of the change +coordinators command. The change coordinators command is not considered +successful until the configuration database is readable on the new +coordinators. This will cause the change coordinators command to hang if run +against a database with dynamic knobs disabled. To disable the client side +configuration database liveness check, specify the ``--no-config-db`` flag when +changing coordinators. For example: + +``` +fdbcli> coordinators auto --no-config-db +``` + +## Status + +The current state of the configuration database is output as part of `status +json`. The configuration path for each process can be determined from the +``command_line`` key associated with each process. + +Sample from ``status json``: + +``` +"configuration_database" : { + "commits" : [ + { + "description" : "set some knobs", + "timestamp" : 1659570000, + "version" : 1 + }, + { + "description" : "make some other changes", + "timestamp" : 1659570000, + "version" : 2 + } + ], + "last_compacted_version" : 0, + "most_recent_version" : 2, + "mutations" : [ + { + "config_class" : "", + "knob_name" : "min_trace_severity", + "knob_value" : "int:5", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "", + "knob_name" : "compaction_interval", + "knob_value" : "double:30.000000", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "az-1", + "knob_name" : "compaction_interval", + "knob_value" : "double:60.000000", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "", + "knob_name" : "compaction_interval", + "type" : "clear", + "version" : 2 + }, + { + "config_class" : "", + "knob_name" : "update_node_timeout", + "knob_value" : "double:4.000000", + "type" : "set", + 
"version" : 2 + } + ], + "snapshot" : { + "" : { + "min_trace_severity" : "int:5", + "update_node_timeout" : "double:4.000000" + }, + "az-1" : { + "compaction_interval" : "double:60.000000" + } + } +} +``` + +After compaction, ``status json`` would show: + +``` +"configuration_database" : { + "commits" : [ + ], + "last_compacted_version" : 2, + "most_recent_version" : 2, + "mutations" : [ + ], + "snapshot" : { + "" : { + "min_trace_severity" : "int:5", + "update_node_timeout" : "double:4.000000" + }, + "az-1" : { + "compaction_interval" : "double:60.000000" + } + } +} +``` + +## Detailed Implementation + +The configuration database is implemented as a replicated state machine living +on the coordinators. This allows configuration database transactions to +continue to function in the event of a catastrophic loss of the transaction +subsystem. + +To commit a transaction, clients run the two phase Paxos protocol. First, the +client asks for a live version from a quorum of coordinators. When a +coordinator receives a request for its live version, it increments its local +live version by one and returns it to the client. Then, the client submits its +writes at the live version it received in the previous step. A coordinator will +accept the commit if it is still on the same live version. If a majority of +coordinators accept the commit, it is considered committed. + +### Coordinator + +Each coordinator runs a ``ConfigNode`` which serves as a replica storing one +full copy of the configuration database. Coordinators never communicate with +other coordinators while processing configuration database transactions. +Instead, the client runs the transaction and determines when it has quorum +agreement. + +Coordinators serve the following ``ConfigTransactionInterface`` to allow +clients to read from and write to the configuration database. 
+ +#### ``ConfigTransactionInterface`` +| Request | Request fields | Reply fields | Explanation | +|------------------|----------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------| +| GetGeneration | (coordinatorsHash) | (generation) or (coordinators_changed error) | Get a new read version. This read version is used for all future requests in the transaction | +| Get | (configuration class, knob name, coordinatorsHash, generation) | (knob value or empty) or (coordinators_changed error) or (transaction_too_old error) | Returns the current value stored at the specified configuration class and knob name, or empty if no value exists | +| GetConfigClasses | (coordinatorsHash, generation) | (configuration classes) or (coordinators_changed error) or (transaction_too_old error) | Returns a list of all configuration classes stored in the configuration database | +| GetKnobs | (configuration class, coordinatorsHash, generation) | (knob names) or (coordinators_changed error) or (transaction_too_old error) | Returns a list of all knob names stored for the provided configuration class | +| Commit | (mutation list, coordinatorsHash, generation) | ack or (coordinators_changed error) or (commit_unknown_result error) or (not_committed error) | Commit mutations set by the transaction | + +Coordinators also serve the following ``ConfigFollowerInterface`` to provide +access to (and modification of) their current state. Most interaction through +this interface is done by the cluster controller through its +``IConfigConsumer`` implementation living on the ``ConfigBroadcaster``. 
+ +#### ``ConfigFollowerInterface`` +| Request | Request fields | Reply fields | Explanation | +|-----------------------|----------------------------------------------------------------------|-----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| GetChanges | (lastSeenVersion, mostRecentVersion) | (mutation list, version) or (version_already_compacted error) or (process_behind error) | Request changes since the last seen version, receive a new most recent version, as well as recent mutations | +| GetSnapshotAndChanges | (mostRecentVersion) | (snapshot, snapshotVersion, changes) | Request the full configuration database, in the form of a base snapshot and changes to apply on top of the snapshot | +| Compact | (version) | ack | Compact mutations up to the provided version | +| Rollforward | (rollbackTo, lastKnownCommitted, target, changes, specialZeroQuorum) | ack or (version_already_compacted error) or (transaction_too_old error) | Rollback/rollforward mutations on a node to catch it up with the majority | +| GetCommittedVersion | () | (registered, lastCompacted, lastLive, lastCommitted) | Request version information from a ``ConfigNode`` | +| Lock | (coordinatorsHash) | ack | Lock a ``ConfigNode`` to prevent it from serving requests during a coordinator change | + +### Cluster Controller + +The cluster controller runs a singleton ``ConfigBroadcaster`` which is +responsible for periodically polling the ``ConfigNode``s for updates, then +broadcasting these updates to workers through the ``ConfigBroadcastInterface``. +When workers join the cluster, they register themselves and their +``ConfigBroadcastInterface`` with the broadcaster. The broadcaster then pushes +new updates to registered workers. + +The ``ConfigBroadcastInterface`` is also used by ``ConfigNode``s to register +with the ``ConfigBroadcaster``. 
``ConfigNode``s need to register with the +broadcaster because the broadcaster decides when the ``ConfigNode`` may begin +serving requests, based on global information about status of other +``ConfigNode``s. For example, if a system with three ``ConfigNode``s suffers a +fault where one ``ConfigNode`` loses data, the faulty ``ConfigNode`` should +not be allowed to begin serving requests again until it has been rolled forward +and is up to date with the latest state of the configuration database. + +#### ``ConfigBroadcastInterface`` + +| Request | Request fields | Reply fields | Explanation | +|------------|------------------------------------------------------------|-------------------------------|---------------------------------------------------------------------------------------------| +| Snapshot | (snapshot, version, restartDelay) | ack | A snapshot of the configuration database sent by the broadcaster to workers | +| Changes | (changes, mostRecentVersion, restartDelay) | ack | A list of changes up to and including mostRecentVersion, sent by the broadcaster to workers | +| Registered | () | (registered, lastSeenVersion) | Sent by the broadcaster to new ``ConfigNode``s to determine their registration status | +| Ready | (snapshot, snapshotVersion, liveVersion, coordinatorsHash) | ack | Sent by the broadcaster to new ``ConfigNode``s to allow them to start serving requests | + +### Worker + +Each worker runs a ``LocalConfiguration`` instance which receives and applies +knob updates from the ``ConfigBroadcaster``. 
The local configuration maintains +a durable ``KeyValueStoreMemory`` containing the following: + +* The latest known configuration version +* The most recently used configuration path +* All knob overrides corresponding to the configuration path at the latest known version + +Once a worker starts, it will: + +* Apply manually set knobs +* Read its local configuration file + * If the stored configuration path does not match the configuration path + specified on the command line, delete the local configuration file + * Otherwise, apply knob updates from the local configuration file. Manually + specified knobs will not be overridden + * Register with the broadcaster to receive new updates for its configuration + classes + * Persist these updates when received and restart if necessary + +### Knob Atomicity + +All knobs are classified as either atomic or non-atomic. Atomic knobs require a +process restart when changed, while non-atomic knobs do not. + +### Compaction + +``ConfigNode``s store individual mutations in order to be able to update other, +out of date ``ConfigNode``s without needing to send a full snapshot. Each +configuration database commit also contains additional metadata such as a +timestamp and a text description of the changes being made. To keep the size of +the configuration database manageable, a compaction process runs periodically +(defaulting to every five minutes) which compacts individual mutations into a +simplified snapshot of key-value pairs. Compaction is controlled by the +``ConfigBroadcaster``, using information it periodically requests from +``ConfigNode``s. Compaction will only compact up to the minimum known version +across *all* ``ConfigNode``s. This means that if one ``ConfigNode`` is +permanently partitioned from the ``ConfigBroadcaster`` or from clients, no +compaction will ever take place. 
+ +### Rollback / Rollforward + +It is necessary to be able to roll ``ConfigNode``s backward and forward with +respect to their committed versions due to the nature of quorum logic and +unreliable networks. + +Consider a case where a client commit gets persisted durably on one out of +three ``ConfigNode``s (assume commit messages to the other two nodes are lost). +Since the value is not committed on a majority of ``ConfigNode``s, it cannot be +considered committed. But it is also incorrect to have the value persist on one +out of three nodes as future commits are made. In this case, the most common +result is that the ``ConfigNode`` will be rolled back when the next commit from +a different client is made, and then rolled forward to contain the data from +the commit. ``PaxosConfigConsumer`` contains logic to recognize ``ConfigNode`` +minorities and update them to match the quorum. + +### Changing Coordinators + +Since the configuration database lives on the coordinators and the +[coordinators can be +changed](https://apple.github.io/foundationdb/configuration.html#configuration-changing-coordination-servers), +it is necessary to copy the configuration database from the old to the new +coordinators during such an event. A coordinator change performs the following +steps in regards to the configuration database: + +1. Write ``\xff/coordinatorsKey`` with the new coordinators string. The key + ``\xff/previousCoordinators`` contains the current (old) set of + coordinators. +2. Lock the old ``ConfigNode``s so they can no longer serve client requests. +3. Start a recovery, causing a new cluster controller (and therefore + ``ConfigBroadcaster``) to be selected. +4. Read ``\xff/previousCoordinators`` on the ``ConfigBroadcaster`` and, if + present, read an up-to-date snapshot of the configuration database on the + old coordinators. +5. 
Determine if each registering ``ConfigNode`` needs an up-to-date snapshot of + the configuration database sent to it, based on its reported version and the + snapshot version of the database received from the old coordinators. + * Some new coordinators which were also coordinators in the previous + configuration may not need a snapshot. +6. Send ready requests to new ``ConfigNode``s, including an up-to-date snapshot + if necessary. This allows the new coordinators to begin serving + configuration database requests from clients. + +## Testing + +The ``ConfigDatabaseUnitTests`` class unit tests a number of different +configuration database dimensions. + +The ``ConfigIncrement`` workload tests contention between clients attempting to +write to the configuration database, paired with machine failure and +coordinator changes. diff --git a/design/global-tag-throttling.md b/design/global-tag-throttling.md index 82f5c847d1..fa710b5a8f 100644 --- a/design/global-tag-throttling.md +++ b/design/global-tag-throttling.md @@ -125,6 +125,3 @@ In each test, the `GlobalTagThrottlerTesting::monitor` function is used to perio On the ratekeeper, every `SERVER_KNOBS->TAG_THROTTLE_PUSH_INTERVAL` seconds, the ratekeeper will call `GlobalTagThrottler::getClientRates`. At the end of the rate calculation for each tag, a trace event of type `GlobalTagThrottler_GotClientRate` is produced. This trace event reports the relevant inputs that went in to the rate calculation, and can be used for debugging. On storage servers, every `SERVER_KNOBS->TAG_MEASUREMENT_INTERVAL` seconds, there are `BusyReadTag` events for every tag that has sufficient read cost to be reported to the ratekeeper. Both cost and fractional busyness are reported. - -### Status -For each storage server, the busiest read tag is reported in the full status output, along with its cost and fractional busyness. 
diff --git a/documentation/sphinx/source/architecture.rst b/documentation/sphinx/source/architecture.rst index 7c28518d74..f693865430 100644 --- a/documentation/sphinx/source/architecture.rst +++ b/documentation/sphinx/source/architecture.rst @@ -14,8 +14,12 @@ Detailed FoundationDB Architecture The FoundationDB architecture chooses a decoupled design, where processes are assigned different heterogeneous roles (e.g., -Coordinators, Storage Servers, Master). Scaling the database is achieved -by horizontally expanding the number of processes for separate roles: +Coordinators, Storage Servers, Master). The cluster attempts to recruit +different roles as separate processes; however, it is possible that +multiple Stateless roles get colocated (recruited) on a single +process to meet the cluster recruitment goals. Scaling the database +is achieved by horizontally expanding the number of processes for +separate roles: Coordinators ~~~~~~~~~~~~ diff --git a/documentation/sphinx/source/client-testing.rst b/documentation/sphinx/source/client-testing.rst index 433a47ce7d..0eb159e8f4 100644 --- a/documentation/sphinx/source/client-testing.rst +++ b/documentation/sphinx/source/client-testing.rst @@ -373,3 +373,302 @@ with the ``multitest`` role: fdbserver -r multitest -f testfile.txt This command will block until all tests are completed. + +########## +API Tester +########## + +Introduction +============ + +API tester is a framework for implementing end-to-end tests of the FDB C API, i.e. testing the API on a real +FDB cluster through all layers of the FDB client. Its executable is ``fdb_c_api_tester``, and the source +code is located in ``bindings/c/test/apitester``. The structure of API Tests is similar to that of the +Simulation Tests. The tests are implemented as workloads using FDB API, which are all built into the +``fdb_c_api_tester``. 
A concrete test configuration is defined as a TOML file, which specifies the +combination of workloads to be executed by the test together with their parameters. The test can then be +executed by passing the TOML file as a parameter to ``fdb_c_api_tester``. + +Since simulation tests rely on the actor model to execute the tests deterministically in single-threaded +mode, they are not suitable for testing various multi-threaded aspects of the FDB client. End-to-end API +tests complement the simulation tests by testing the FDB Client layers above the single-threaded Native +Client. + +The specific testing goals of the end-to-end tests are: +- Check functional correctness of the Multi-Version Client (MVC) and Thread-Safe Client +- Detecting race conditions. They can be caused by accessing the state of the Native Client from wrong + threads or introducing other shared state without proper synchronization +- Detecting memory management errors. Thread-safe reference counting must be used where necessary. MVC + works with multiple client libraries. Memory allocated by one client library must also be deallocated + by the same library. +- Maintaining interoperability with other client versions. The client functionality is made available + depending on the selected API version. The API changes are correctly adapted. +- Client API behaves correctly in case of cluster upgrades. Database and transaction state is correctly + migrated to the upgraded connections. Pending operations are canceled and successfully retried on the + upgraded connections. + +Implementing a Workload +======================= + +Each workload is declared as a direct or indirect subclass of ``WorkloadBase`` implementing a constructor +with ``WorkloadConfig`` as a parameter and the method ``start()``, which defines the entry point of the +workload. + +``WorkloadBase`` provides a set of methods that serve as building blocks for implementation of a workload: + +.. 
function:: execTransaction(start, cont, failOnError = true) + + creates and executes an FDB transaction. Here ``start`` is a function that takes a transaction context + as parameter and implements the starting point of the transaction, and ``cont`` is a function implementing + a continuation to be executed after finishing the transaction execution. Transactions are automatically + retried on retryable errors. Transactions are retried by calling the ``start`` function again. In case + of a fatal error, the entire workload is considered as failed unless ``failOnError`` is set to ``false``. + +.. function:: schedule(task) + + schedules a task for asynchronous execution. It is usually used in the continuations to schedule + the next step of the workload. + +.. function:: info(msg) + error(msg) + + are used for logging a message with a tag identifying the workload. Issuing an error message marks + the workload as failed. + +The transaction context provides methods for implementation of the transaction logic: + +.. function:: tx() + + the reference to the FDB transaction object + +.. function:: continueAfter(future, cont, retryOnError = true) + + set a continuation to be executed when the future is ready. The ``retryOnError`` flag controls whether + the transaction should be automatically retried in case the future results in a retriable error. + +.. function:: continueAfterAll(futures, cont) + + takes a vector of futures and sets a continuation to be executed when all of the futures get ready. + The transaction is retried if at least one of the futures results in an error. This method is useful + for handling multiple concurrent reads. + +.. function:: commit() + + commit and finish the transaction. If the commit is successful, the execution proceeds to the + continuation of ``execTransaction()``. In case of a retriable error the transaction is + automatically retried. A fatal error results in a failure of the workload. + + +.. 
function:: done() + + finish the transaction without committing. This method should be used to finish read transactions. + The transaction gets destroyed and execution proceeds to the continuation of ``execTransaction()``. + Each transaction must be finished either by ``commit()`` or ``done()``, because otherwise + the framework considers that the transaction is still being executed, so it won't destroy it and + won't call the continuation. + +.. function:: onError(err) + + Handle an error: restart the transaction in case of a retriable error, otherwise fail the workload. + This method is typically used in the continuation of ``continueAfter`` called with + ``retryOnError=false`` as a fallback to the default error handling. + +A workload execution ends automatically when it is marked as failed or its last continuation does not +schedule any new task or transaction. + +The workload class should be defined in the namespace FdbApiTester. The file name convention is +``Tester{Name}Workload.cpp`` so that we distinguish them from the source files of simulation workloads. + +Basic Workload Example +====================== + +The code below implements a workload that consists of only two transactions. The first one sets a +randomly generated key to a randomly generated value, and the second one reads the key and checks if +the returned value matches the written one. + +.. literalinclude:: ../../../bindings/c/test/apitester/TesterExampleWorkload.cpp + :language: C++ + :lines: 21- + +The workload is implemented in the method ``setAndGet``. It generates a random key and a random value +and executes a transaction that writes that key-value pair and commits. In the continuation of the +first ``execTransaction`` call, we execute the second transaction that reads the same key. The read +operation returns a future. So we call ``continueAfter`` to set a continuation for that future. 
In the +continuation we check if the returned value matches the written one and finish the transaction by +calling ``ctx->done()``. After completing the second transaction we execute the continuation passed +as parameter to the ``setAndGet`` method by the start method. In this case it is ``NO_OP_TASK``, which +does nothing and so finishes the workload. + +Finally, we declare an instance ``WorkloadFactory`` to register this workload with the name ``SetAndGet``. + +Note that we use ``workloadId`` as a key prefix. This is necessary for isolating the key space of this +workload, because the framework may be instructed to create multiple instances of the ``SetAndGet`` +workload. If we do not isolate the key space, another workload can write a different value for the +same key and so break the assumption of the test. + +The workload is implemented using the internal C++ API, implemented in ``fdb_api.hpp``. It introduces +a set of classes representing the FDB objects (transactions, futures, etc.). These classes provide C++-style +methods wrapping FDB C API calls and automate memory management by means of reference counting. + +Implementing Control Structures +=============================== + +Our basic workload executes just 2 transactions, but in practice we want to have workloads that generate +multiple transactions. The following code demonstrates how we can modify our basic workload to generate +multiple transactions in a loop. + +.. code-block:: C++ + + class SetAndGetWorkload : public WorkloadBase { + public: + ... 
+ int numIterations; + int iterationsLeft; + + SetAndGetWorkload(const WorkloadConfig& config) : WorkloadBase(config) { + keyPrefix = fdb::toBytesRef(fmt::format("{}/", workloadId)); + numIterations = config.getIntOption("numIterations", 1000); + } + + void start() override { + iterationsLeft = numIterations; + setAndGetLoop(); + } + + void setAndGetLoop() { + if (iterationsLeft == 0) { + return; + } + iterationsLeft--; + setAndGet([this]() { setAndGetLoop(); }); + } + ... + } + +We introduce a workload parameter ``numIterations`` to specify the number of iterations. If not specified +in the test configuration it defaults to 1000. + +The method ``setAndGetLoop`` implements the loop that decrements the ``iterationsLeft`` counter until it reaches 0 +and each iteration calls ``setAndGet`` with a continuation that returns the execution to the loop. As you +can see we don't need any change in ``setAndGet``, just call it with another continuation. + +The pattern of passing a continuation as a parameter can also be used to decompose the workload into a +sequence of steps. For example, we can introduce setup and cleanup steps to our workload and modify the +``setAndGetLoop`` to make it composable with an arbitrary continuation: + +.. code-block:: C++ + + void start() override { + setup([this](){ + iterationsLeft = numIterations; + setAndGetLoop([this](){ + cleanup(NO_OP_TASK); + }); + }); + } + + void setAndGetLoop(TTaskFct cont) { + if (iterationsLeft == 0) { + schedule(cont); return; + } + iterationsLeft--; + setAndGet([this, cont]() { setAndGetLoop(cont); }); + } + + void setup(TTaskFct cont) { ... } + + void cleanup(TTaskFct cont) { ... } + +Note that we call ``schedule(cont)`` in ``setAndGetLoop`` instead of calling the continuation directly. +In this way we avoid keeping ``setAndGetLoop`` in the call stack when executing the next step. 
+ +Subclassing ApiWorkload +======================= + +``ApiWorkload`` is an abstract subclass of ``WorkloadBase`` that provides a framework for a typical +implementation of API test workloads. It implements a workflow consisting of cleaning up the key space +of the workload, populating it with newly generated data and then running a loop consisting of random +database operations. The concrete subclasses of ``ApiWorkload`` are expected to override the method +``randomOperation`` with an implementation of concrete random operations. + +The ``ApiWorkload`` maintains a local key-value store that mirrors the part of the database state +relevant to the workload. A successful database write operation should be followed by a continuation +that performs equivalent changes in the local store, and the results of a database read operation should +be validated against the values from the local store. + +Test Configuration +================== + +A concrete test configuration is specified by a TOML file. The file must contain one ``[[test]]`` section +specifying the general settings for test execution followed by one or more ``[[test.workload]]`` +configuration sections, specifying the workloads to be executed and their parameters. The specified +workloads are started all at once and executed concurrently. + +The ``[[test]]`` section can contain the following options: + +- ``title``: descriptive title of the test +- ``multiThreaded``: enable multi-threading (default: false) +- ``minFdbThreads`` and ``maxFdbThreads``: the number of FDB (network) threads to be randomly selected + from the given range (default: 1-1). Used only if ``multiThreaded=true``. It is also important to use + multiple database instances to make use of the multithreading. +- ``minDatabases`` and ``maxDatabases``: the number of database instances to be randomly selected from + the given range (default 1-1). The transactions of all workloads are randomly load-balanced over the + pool of database instances. 
+- ``minClients`` and ``maxClients``: the number of clients, i.e. instances of each workload, to be
+ randomly selected from the given range (default 1-8).
+- ``minClientThreads`` and ``maxClientThreads``: the number of client threads, i.e. the threads used
+ for execution of the workload, to be randomly selected from the given range (default 1-1).
+- ``blockOnFutures``: use blocking waits on futures instead of scheduling future callbacks asynchronously
+ (default: false)
+- ``buggify``: Enable client-side failure injection (default: false)
+- ``databasePerTransaction``: Create a separate database instance for each transaction (default: false).
+ It is a special mode useful for testing bugs related to creation and destruction of database instances.
+- ``fdbCallbacksOnExternalThreads``: Enables the option ``FDB_NET_OPTION_CALLBACKS_ON_EXTERNAL_THREADS``
+ causing the callbacks of futures to be executed directly on the threads of the external FDB clients
+ rather than on the thread of the local FDB client.
+
+The workload section ``[[test.workload]]`` must contain the attribute name matching the registered name
+of the workload to be executed. Other options are workload-specific.
+
+The subclasses of the ``ApiWorkload`` inherit the following configuration options:
+
+- ``minKeyLength`` and ``maxKeyLength``: the size range of randomly generated keys (default: 1-64)
+- ``minValueLength`` and ``maxValueLength``: the size range of randomly generated values
+ (default: 1-1000)
+- ``maxKeysPerTransaction``: the maximum number of keys per transaction (default: 50)
+- ``initialSize``: the number of key-value pairs in the initially populated database (default: 1000)
+- ``readExistingKeysRatio``: the probability of choosing an existing key for read operations
+ (default: 0.9)
+- ``numRandomOperations``: the number of random operations to be executed per workload (default: 1000)
+- ``runUntilStop``: run the workload indefinitely until the stop command is received (default: false).
+ This execution mode is used in upgrade tests and other scripted tests, where the workload needs to
+ be generated continuously until completion of the scripted test.
+- ``numOperationsForProgressCheck``: the number of operations to be performed to confirm a progress
+ check (default: 10). This option is used in combination with ``runUntilStop``. Progress checks are
+ initiated by a test script to check if the client workload is successfully progressing after a
+ cluster change.
+
+Executing the Tests
+===================
+
+The ``fdb_c_api_tester`` executable takes a single TOML file as a parameter and executes the test
+according to its specification. Before that we must create an FDB cluster and pass its cluster file as
+a parameter to ``fdb_c_api_tester``. Note that multithreaded tests also need to be provided with an
+external client library.
+
+For example, we can create a temporary cluster and use it for execution of one of the existing API tests:
+
+..
code-block:: bash + + ${srcDir}/tests/TestRunner/tmp_cluster.py --build-dir ${buildDir} -- \ + ${buildDir}/bin/fdb_c_api_tester \ + --cluster-file @CLUSTER_FILE@ \ + --external-client-library=${buildDir}/bindings/c/libfdb_c_external.so \ + --test-file ${srcDir}/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml + +The test specifications added to the ``bindings/c/test/apitester/tests/`` directory are executed as a part +of the regression test suite. They can be executed using the ``ctest`` target ``fdb_c_api_tests``: + +.. code-block:: bash + + ctest -R fdb_c_api_tests -VV diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index 699c811139..5d52d40910 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -416,6 +416,9 @@ FoundationDB will never use processes on the same machine for the replication of ``three_data_hall`` mode FoundationDB stores data in triplicate, with one copy on a storage server in each of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece. Four available machines (two in each of two data halls) are therefore required to make progress. This configuration enables the cluster to remain available after losing a single data hall and one machine in another data hall. +``three_data_hall_fallback`` mode + FoundationDB stores data in duplicate, with one copy each on a storage server in two of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece. Four available machines (two in each of two data halls) are therefore required to make progress. This configuration is similar to ``three_data_hall``, differing only in that data is stored on two instead of three replicas. This configuration is useful to unblock data distribution when a data hall becomes temporarily unavailable. 
Because ``three_data_hall_fallback`` reduces the redundancy level to two, it should only be used as a temporary measure to restore cluster health during a datacenter outage. + Datacenter-aware mode --------------------- diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 9d68ab36c6..2cca7fb608 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -379,7 +379,9 @@ "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -400,7 +402,9 @@ "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -599,7 +603,7 @@ "counter":0, "roughness":0.0 }, - "memory_errors":{ // measures number of proxy_memory_limit_exceeded errors + "memory_errors":{ // measures number of (commit/grv)_proxy_memory_limit_exceeded errors "hz":0.0, "counter":0, "roughness":0.0 diff --git a/documentation/sphinx/source/mr-status.rst b/documentation/sphinx/source/mr-status.rst index 5eb404bbd4..ed550cbee7 100644 --- a/documentation/sphinx/source/mr-status.rst +++ b/documentation/sphinx/source/mr-status.rst @@ -131,6 +131,9 @@ min_free_space_ratio Running out of space (approaching 5% limit). log_server_min_free_space Log server running out of space (approaching 100MB limit). log_server_min_free_space_ratio Log server running out of space (approaching 5% limit). storage_server_durability_lag Storage server durable version falling behind. 
+storage_server_list_fetch_failed Unable to fetch storage server list. +blob_worker_lag Blob worker granule version falling behind. +blob_worker_missing No blob workers are reporting metrics. =================================== ==================================================== The JSON path ``cluster.qos.throttled_tags``, when it exists, is an Object containing ``"auto"`` , ``"manual"`` and ``"recommended"``. The possible fields for those object are in the following table: diff --git a/documentation/sphinx/source/release-notes/release-notes-710.rst b/documentation/sphinx/source/release-notes/release-notes-710.rst index 1cd51ad968..05a33625e8 100644 --- a/documentation/sphinx/source/release-notes/release-notes-710.rst +++ b/documentation/sphinx/source/release-notes/release-notes-710.rst @@ -2,6 +2,30 @@ Release Notes ############# +7.1.21 +====== +* Same as 7.1.20 release with AVX enabled. + +7.1.20 +====== +* Released with AVX disabled. +* Fixed missing localities for fdbserver that can cause cross DC calls among storage servers. `(PR #7995) `_ +* Removed extremely spammy trace event in FetchKeys and fixed transaction_profiling_analyzer.py. `(PR #7934) `_ +* Fixed bugs when GRV proxy returns an error. `(PR #7860) `_ + +7.1.19 +====== +* Same as 7.1.18 release with AVX enabled. + +7.1.18 +====== +* Released with AVX disabled. +* Added knobs for the minimum and the maximum of the Ratekeeper's default priority. `(PR #7820) `_ +* Fixed bugs in ``getRange`` of the special key space. `(PR #7778) `_, `(PR #7720) `_ +* Added debug ID for secondary queries in index prefetching. `(PR #7755) `_ +* Changed hostname resolving to prefer IPv6 addresses. `(PR #7750) `_ +* Added more transaction debug events for prefetch queries. `(PR #7732) `_ + 7.1.17 ====== * Same as 7.1.16 release with AVX enabled. @@ -15,7 +39,7 @@ Release Notes * Fixed ScopeEventFieldTypeMismatch error for TLogMetrics. `(PR #7640) `_ * Added getMappedRange latency metrics. 
`(PR #7632) `_ * Fixed a version vector performance bug due to not updating client side tag cache. `(PR #7616) `_ -* Fixed DiskReadSeconds and DiskWriteSeconds calculaion in ProcessMetrics. `(PR #7609) `_ +* Fixed DiskReadSeconds and DiskWriteSeconds calculation in ProcessMetrics. `(PR #7609) `_ * Added Rocksdb compression and data size stats. `(PR #7596) `_ 7.1.15 @@ -74,7 +98,7 @@ Release Notes * Added support of the reboot command in go bindings. `(PR #7270) `_ * Fixed several issues in profiling special keys using GlobalConfig. `(PR #7120) `_ * Fixed a stuck transaction system bug due to inconsistent recovery transaction version. `(PR #7261) `_ -* Fixed a unknown_error crash due to not resolving hostnames. `(PR #7254) `_ +* Fixed an unknown_error crash due to not resolving hostnames. `(PR #7254) `_ * Fixed a heap-use-after-free bug. `(PR #7250) `_ * Fixed a performance issue that remote TLogs are sending too many pops to log routers. `(PR #7235) `_ * Fixed an issue that SharedTLogs are not displaced and leaking disk space. `(PR #7246) `_ diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index 45b9576f31..aa5eede4af 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -22,6 +22,8 @@ Each special key that existed before api version 630 is its own module. These ar #. ``\xff\xff/cluster_file_path`` - See :ref:`cluster file client access ` #. ``\xff\xff/status/json`` - See :doc:`Machine-readable status ` +#. ``\xff\xff/worker_interfaces`` - key as the worker's network address and value as the serialized ClientWorkerInterface, not transactional + Prior to api version 630, it was also possible to read a range starting at ``\xff\xff/worker_interfaces``. This is mostly an implementation detail of fdbcli, but it's available in api version 630 as a module with prefix ``\xff\xff/worker_interfaces/``. 
@@ -210,6 +212,7 @@ that process, and wait for necessary data to be moved away. #. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. #. ``\xff\xff/management/tenant/map/`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ````. Clearing a key in this range will delete the tenant with name ````. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. #. ``\xff\xff/management/tenant/rename/`` Read/write. Setting a key in this range to an unused tenant name will result in the tenant with the name ```` to be renamed to the value provided. If the rename operation is a transaction retried in a loop, it is possible for the rename to be applied twice, in which case ``tenant_not_found`` or ``tenant_already_exists`` errors may be returned. This can be avoided by checking for the tenant's existence first. +#. ``\xff\xff/management/options/worker_interfaces/verify`` Read/write. Setting this key will add a verification phase in reading ``\xff\xff/worker_interfaces``. Setting this key only has an effect in the current transaction and is not persisted on commit. Try to establish connections with every worker from the list returned by Cluster Controller and only return those workers that the client can connect to. This option is now only used in fdbcli commands ``kill``, ``suspend`` and ``expensive_data_check`` to populate the worker list. An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or an ip address and port (e.g. 
``127.0.0.1:4500``) or any locality (e.g ``locality_dcid:primary-satellite`` or diff --git a/documentation/sphinx/source/tenants.rst b/documentation/sphinx/source/tenants.rst index d22603b20e..b631c55ba2 100644 --- a/documentation/sphinx/source/tenants.rst +++ b/documentation/sphinx/source/tenants.rst @@ -49,7 +49,7 @@ All operations performed within a tenant transaction will occur within the tenan Raw access ---------- -When operating in the tenant mode ``required_experimental``, transactions are not ordinarily permitted to run without using a tenant. In order to access the system keys or perform maintenance operations that span multiple tenants, it is required to use the ``RAW_ACCESS`` transaction option to access the global key-space. It is an error to specify ``RAW_ACCESS`` on a transaction that is configured to use a tenant. +When operating in the tenant mode ``required_experimental`` or using a metacluster, transactions are not ordinarily permitted to run without using a tenant. In order to access the system keys or perform maintenance operations that span multiple tenants, it is required to use the ``RAW_ACCESS`` transaction option to access the global key-space. It is an error to specify ``RAW_ACCESS`` on a transaction that is configured to use a tenant. .. note :: Setting the ``READ_SYSTEM_KEYS`` or ``ACCESS_SYSTEM_KEYS`` options implies ``RAW_ACCESS`` for your transaction. 
diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 3d7889a36c..1a0f2eba14 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -928,7 +928,7 @@ void parentWatcher(void* parentHandle) { static void printVersion() { printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); printf("source version %s\n", getSourceVersion()); - printf("protocol %llx\n", (long long)currentProtocolVersion.version()); + printf("protocol %llx\n", (long long)currentProtocolVersion().version()); } static void printBuildInformation() { diff --git a/fdbcli/BlobRangeCommand.actor.cpp b/fdbcli/BlobRangeCommand.actor.cpp index b5fa48ff0d..4c6bdf9614 100644 --- a/fdbcli/BlobRangeCommand.actor.cpp +++ b/fdbcli/BlobRangeCommand.actor.cpp @@ -23,6 +23,7 @@ #include "fdbclient/FDBOptions.g.h" #include "fdbclient/IClientApi.h" #include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" #include "flow/Arena.h" #include "flow/FastRef.h" @@ -31,33 +32,6 @@ namespace { -// copy to standalones for krm -ACTOR Future setBlobRange(Database db, Key startKey, Key endKey, Value value) { - state Reference tr = makeReference(db); - - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - - // FIXME: check that the set range is currently inactive, and that a revoked range is currently its own - // range in the map and fully set. - - tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString()); - // This is not coalescing because we want to keep each range logically separate. 
- wait(krmSetRange(tr, blobRangeKeys.begin, KeyRange(KeyRangeRef(startKey, endKey)), value)); - wait(tr->commit()); - printf("Successfully updated blob range [%s - %s) to %s\n", - startKey.printable().c_str(), - endKey.printable().c_str(), - value.printable().c_str()); - return Void(); - } catch (Error& e) { - wait(tr->onError(e)); - } - } -} - ACTOR Future getLatestReadVersion(Database db) { state Transaction tr(db); loop { @@ -78,7 +52,7 @@ ACTOR Future printAfterDelay(double delaySeconds, std::string message) { return Void(); } -ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, Optional version) { +ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, Optional version, bool force) { state Version purgeVersion; if (version.present()) { purgeVersion = version.get(); @@ -86,7 +60,7 @@ ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, OptionalpurgeBlobGranules(KeyRange(KeyRangeRef(startKey, endKey)), purgeVersion, {})); + state Key purgeKey = wait(db->purgeBlobGranules(KeyRange(KeyRangeRef(startKey, endKey)), purgeVersion, {}, force)); fmt::print("Blob purge registered for [{0} - {1}) @ {2}\n", startKey.printable(), endKey.printable(), purgeVersion); @@ -99,65 +73,10 @@ ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, Optional checkBlobSubrange(Database db, KeyRange keyRange, Optional version) { - state Transaction tr(db); - state Version readVersionOut = invalidVersion; - loop { - try { - wait(success(tr.readBlobGranules(keyRange, 0, version, &readVersionOut))); - return readVersionOut; - } catch (Error& e) { - wait(tr.onError(e)); - } - } -} - ACTOR Future doBlobCheck(Database db, Key startKey, Key endKey, Optional version) { - state Transaction tr(db); - state Version readVersionOut = invalidVersion; state double elapsed = -timer_monotonic(); - state KeyRange range = KeyRange(KeyRangeRef(startKey, endKey)); - state Standalone> allRanges; - loop { - try { - wait(store(allRanges, tr.getBlobGranuleRanges(range))); 
- break; - } catch (Error& e) { - wait(tr.onError(e)); - } - } - if (allRanges.empty()) { - fmt::print("ERROR: No blob ranges for [{0} - {1})\n", startKey.printable(), endKey.printable()); - return Void(); - } - fmt::print("Loaded {0} blob ranges to check\n", allRanges.size()); - state std::vector> checkParts; - // chunk up to smaller ranges than max - int maxChunkSize = 1000; - KeyRange currentChunk; - int currentChunkSize = 0; - for (auto& it : allRanges) { - if (currentChunkSize == maxChunkSize) { - checkParts.push_back(checkBlobSubrange(db, currentChunk, version)); - currentChunkSize = 0; - } - if (currentChunkSize == 0) { - currentChunk = it; - } else if (it.begin != currentChunk.end) { - fmt::print("ERROR: Blobrange check failed, gap in blob ranges from [{0} - {1})\n", - currentChunk.end.printable(), - it.begin.printable()); - return Void(); - } else { - currentChunk = KeyRangeRef(currentChunk.begin, it.end); - } - currentChunkSize++; - } - checkParts.push_back(checkBlobSubrange(db, currentChunk, version)); - - wait(waitForAll(checkParts)); - readVersionOut = checkParts.back().get(); + state Version readVersionOut = wait(db->verifyBlobRange(KeyRangeRef(startKey, endKey), version)); elapsed += timer_monotonic(); @@ -201,7 +120,7 @@ ACTOR Future blobRangeCommandActor(Database localDb, fmt::print("Invalid blob range [{0} - {1})\n", tokens[2].printable(), tokens[3].printable()); } else { if (tokencmp(tokens[1], "start") || tokencmp(tokens[1], "stop")) { - bool starting = tokencmp(tokens[1], "start"); + state bool starting = tokencmp(tokens[1], "start"); if (tokens.size() > 4) { printUsage(tokens[0]); return false; @@ -210,9 +129,22 @@ ACTOR Future blobRangeCommandActor(Database localDb, starting ? "Starting" : "Stopping", tokens[2].printable().c_str(), tokens[3].printable().c_str()); - wait(setBlobRange(localDb, begin, end, starting ? 
LiteralStringRef("1") : StringRef())); - } else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "check")) { - bool purge = tokencmp(tokens[1], "purge"); + state bool success = false; + if (starting) { + wait(store(success, localDb->blobbifyRange(KeyRangeRef(begin, end)))); + } else { + wait(store(success, localDb->unblobbifyRange(KeyRangeRef(begin, end)))); + } + if (!success) { + fmt::print("{0} blobbify range for [{1} - {2}) failed\n", + starting ? "Starting" : "Stopping", + tokens[2].printable().c_str(), + tokens[3].printable().c_str()); + } + return success; + } else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check")) { + bool purge = tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge"); + bool forcePurge = tokencmp(tokens[1], "forcepurge"); Optional version; if (tokens.size() > 4) { @@ -225,17 +157,18 @@ ACTOR Future blobRangeCommandActor(Database localDb, version = v; } - fmt::print("{0} blob range [{1} - {2})", + fmt::print("{0} blob range [{1} - {2}){3}", purge ? "Purging" : "Checking", tokens[2].printable(), - tokens[3].printable()); + tokens[3].printable(), + forcePurge ? 
" (force)" : ""); if (version.present()) { fmt::print(" @ {0}", version.get()); } fmt::print("\n"); if (purge) { - wait(doBlobPurge(localDb, begin, end, version)); + wait(doBlobPurge(localDb, begin, end, version, forcePurge)); } else { wait(doBlobCheck(localDb, begin, end, version)); } @@ -247,8 +180,7 @@ ACTOR Future blobRangeCommandActor(Database localDb, return true; } -CommandFactory blobRangeFactory("blobrange", - CommandHelp("blobrange [version]", - "", - "")); +CommandFactory blobRangeFactory( + "blobrange", + CommandHelp("blobrange [version]", "", "")); } // namespace fdb_cli diff --git a/fdbcli/ConfigureCommand.actor.cpp b/fdbcli/ConfigureCommand.actor.cpp index 37474242e1..52521ea677 100644 --- a/fdbcli/ConfigureCommand.actor.cpp +++ b/fdbcli/ConfigureCommand.actor.cpp @@ -272,6 +272,10 @@ ACTOR Future configureCommandActor(Reference db, stderr, "WARN: Sharded RocksDB storage engine type is still in experimental stage, not yet production tested.\n"); break; + case ConfigurationResult::DATABASE_IS_REGISTERED: + fprintf(stderr, "ERROR: A cluster cannot change its tenant mode while part of a metacluster.\n"); + ret = false; + break; default: ASSERT(false); ret = false; diff --git a/fdbcli/ExpensiveDataCheckCommand.actor.cpp b/fdbcli/ExpensiveDataCheckCommand.actor.cpp index e9d5c5b989..3be572d3d1 100644 --- a/fdbcli/ExpensiveDataCheckCommand.actor.cpp +++ b/fdbcli/ExpensiveDataCheckCommand.actor.cpp @@ -46,7 +46,7 @@ ACTOR Future expensiveDataCheckCommandActor( if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); } if (tokens.size() == 1 || tokencmp(tokens[1], "list")) { if (address_interface->size() == 0) { diff --git a/fdbcli/KillCommand.actor.cpp b/fdbcli/KillCommand.actor.cpp index d025b10388..c8fa75bb1c 100644 --- a/fdbcli/KillCommand.actor.cpp +++ b/fdbcli/KillCommand.actor.cpp @@ -44,7 +44,7 @@ ACTOR Future 
killCommandActor(Reference db, if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); } if (tokens.size() == 1 || tokencmp(tokens[1], "list")) { if (address_interface->size() == 0) { diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp new file mode 100644 index 0000000000..da7c0f79fd --- /dev/null +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -0,0 +1,432 @@ +/* + * MetaclusterCommands.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/Knobs.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/Schemas.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +namespace fdb_cli { + +Optional, Optional>> +parseClusterConfiguration(std::vector const& tokens, DataClusterEntry const& defaults, int startIndex) { + Optional entry; + Optional connectionString; + + std::set usedParams; + for (int tokenNum = startIndex; tokenNum < tokens.size(); ++tokenNum) { + StringRef token = tokens[tokenNum]; + bool foundEquals; + StringRef param = token.eat("=", &foundEquals); + if (!foundEquals) { + fmt::print(stderr, + "ERROR: invalid configuration string `{}'. String must specify a value using `='.\n", + param.toString().c_str()); + return {}; + } + std::string value = token.toString(); + if (!usedParams.insert(value).second) { + fmt::print( + stderr, "ERROR: configuration parameter `{}' specified more than once.\n", param.toString().c_str()); + return {}; + } + if (tokencmp(param, "max_tenant_groups")) { + entry = defaults; + + int n; + if (sscanf(value.c_str(), "%d%n", &entry.get().capacity.numTenantGroups, &n) != 1 || n != value.size() || + entry.get().capacity.numTenantGroups < 0) { + fmt::print(stderr, "ERROR: invalid number of tenant groups `{}'.\n", value.c_str()); + return {}; + } + } else if (tokencmp(param, "connection_string")) { + connectionString = ClusterConnectionString(value); + } else { + fmt::print(stderr, "ERROR: unrecognized configuration parameter `{}'.\n", param.toString().c_str()); + return {}; + } + } + + return std::make_pair(connectionString, entry); +} + +void printMetaclusterConfigureOptionsUsage() { + fmt::print("max_tenant_groups sets the maximum number of tenant groups that can be assigned\n" + "to the named data cluster.\n"); + fmt::print("connection_string sets the connection string for the named data cluster.\n"); +} + +// metacluster create command +ACTOR Future metaclusterCreateCommand(Reference db, std::vector tokens) { + if (tokens.size() != 3) { + fmt::print("Usage: metacluster create_experimental \n\n"); + fmt::print("Configures the cluster to be a management cluster in a metacluster.\n"); + 
fmt::print("NAME is an identifier used to distinguish this metacluster from other metaclusters.\n"); + return false; + } + + Optional errorStr = wait(MetaclusterAPI::createMetacluster(db, tokens[2])); + if (errorStr.present()) { + fmt::print("ERROR: {}.\n", errorStr.get()); + } else { + fmt::print("The cluster has been configured as a metacluster.\n"); + } + return true; +} + +// metacluster decommission command +ACTOR Future metaclusterDecommissionCommand(Reference db, std::vector tokens) { + if (tokens.size() != 2) { + fmt::print("Usage: metacluster decommission\n\n"); + fmt::print("Converts the current cluster from a metacluster management cluster back into an\n"); + fmt::print("ordinary cluster. It must be called on a cluster with no registered data clusters.\n"); + return false; + } + + wait(MetaclusterAPI::decommissionMetacluster(db)); + + fmt::print("The cluster is no longer a metacluster.\n"); + return true; +} + +// metacluster register command +ACTOR Future metaclusterRegisterCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4) { + fmt::print("Usage: metacluster register connection_string=\n" + "[max_tenant_groups=]\n\n"); + fmt::print("Adds a data cluster to a metacluster.\n"); + fmt::print("NAME is used to identify the cluster in future commands.\n"); + printMetaclusterConfigureOptionsUsage(); + return false; + } + + DataClusterEntry defaultEntry; + auto config = parseClusterConfiguration(tokens, defaultEntry, 3); + if (!config.present()) { + return false; + } else if (!config.get().first.present()) { + fmt::print(stderr, "ERROR: connection_string must be configured when registering a cluster.\n"); + return false; + } + + wait(MetaclusterAPI::registerCluster( + db, tokens[2], config.get().first.get(), config.get().second.orDefault(defaultEntry))); + + fmt::print("The cluster `{}' has been added\n", printable(tokens[2]).c_str()); + return true; +} + +// metacluster remove command +ACTOR Future metaclusterRemoveCommand(Reference db, 
std::vector tokens) { + if (tokens.size() < 3 || tokens.size() > 4 || (tokens.size() == 4 && tokens[2] != "FORCE"_sr)) { + fmt::print("Usage: metacluster remove [FORCE] \n\n"); + fmt::print("Removes the specified data cluster from a metacluster.\n"); + fmt::print("If FORCE is specified, then the cluster will be detached even if it has\n" + "tenants assigned to it.\n"); + return false; + } + + state ClusterNameRef clusterName = tokens[tokens.size() - 1]; + wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4)); + + fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); + return true; +} + +// metacluster configure command +ACTOR Future metaclusterConfigureCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4) { + fmt::print("Usage: metacluster configure |\n" + "connection_string=> ...\n\n"); + fmt::print("Updates the configuration of the metacluster.\n"); + printMetaclusterConfigureOptionsUsage(); + return false; + } + + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + Optional metadata = wait(MetaclusterAPI::tryGetClusterTransaction(tr, tokens[2])); + if (!metadata.present()) { + throw cluster_not_found(); + } + + auto config = parseClusterConfiguration(tokens, metadata.get().entry, 3); + if (!config.present()) { + return false; + } + + MetaclusterAPI::updateClusterMetadata( + tr, tokens[2], metadata.get(), config.get().first, config.get().second); + + wait(safeThreadFutureToFuture(tr->commit())); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return true; +} + +// metacluster list command +ACTOR Future metaclusterListCommand(Reference db, std::vector tokens) { + if (tokens.size() > 5) { + fmt::print("Usage: metacluster list [BEGIN] [END] [LIMIT]\n\n"); + fmt::print("Lists the data clusters in a 
metacluster.\n"); + fmt::print("Only cluster names in the range BEGIN - END will be printed.\n"); + fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); + return false; + } + + state ClusterNameRef begin = tokens.size() > 2 ? tokens[2] : ""_sr; + state ClusterNameRef end = tokens.size() > 3 ? tokens[3] : "\xff"_sr; + int limit = 100; + + if (tokens.size() > 4) { + int n = 0; + if (sscanf(tokens[3].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[3].size() || limit < 0) { + fmt::print(stderr, "ERROR: invalid limit {}\n", tokens[3].toString().c_str()); + return false; + } + } + + std::map clusters = wait(MetaclusterAPI::listClusters(db, begin, end, limit)); + if (clusters.empty()) { + if (tokens.size() == 2) { + fmt::print("The metacluster has no registered data clusters\n"); + } else { + fmt::print("The metacluster has no registered data clusters in the specified range\n"); + } + } + + int index = 0; + for (auto cluster : clusters) { + fmt::print(" {}. 
{}\n", ++index, printable(cluster.first).c_str()); + } + + return true; +} + +// metacluster get command +ACTOR Future metaclusterGetCommand(Reference db, std::vector tokens) { + if (tokens.size() > 4 || (tokens.size() == 4 && tokens[3] != "JSON"_sr)) { + fmt::print("Usage: metacluster get [JSON]\n\n"); + fmt::print("Prints metadata associated with the given data cluster.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); + return false; + } + + state bool useJson = tokens.size() == 4; + + try { + DataClusterMetadata metadata = wait(MetaclusterAPI::getCluster(db, tokens[2])); + + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "success"; + obj["cluster"] = metadata.toJson(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + } else { + fmt::print(" connection string: {}\n", metadata.connectionString.toString().c_str()); + fmt::print(" cluster state: {}\n", DataClusterEntry::clusterStateToString(metadata.entry.clusterState)); + fmt::print(" tenant group capacity: {}\n", metadata.entry.capacity.numTenantGroups); + fmt::print(" allocated tenant groups: {}\n", metadata.entry.allocated.numTenantGroups); + } + } catch (Error& e) { + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "error"; + obj["error"] = e.what(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + return false; + } else { + throw; + } + } + + return true; +} + +// metacluster status command +ACTOR Future metaclusterStatusCommand(Reference db, std::vector tokens) { + if (tokens.size() < 2 || tokens.size() > 3) { + fmt::print("Usage: metacluster status [JSON]\n\n"); + fmt::print("Prints metacluster metadata.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); + return false; + } + + state bool useJson = tokens.size() == 3; + + try { + std::map clusters = + wait(MetaclusterAPI::listClusters(db, 
""_sr, "\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS)); + + ClusterUsage totalCapacity; + ClusterUsage totalAllocated; + for (auto cluster : clusters) { + totalCapacity.numTenantGroups += + std::max(cluster.second.entry.capacity.numTenantGroups, cluster.second.entry.allocated.numTenantGroups); + totalAllocated.numTenantGroups += cluster.second.entry.allocated.numTenantGroups; + } + + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "success"; + + json_spirit::mObject metaclusterObj; + metaclusterObj["data_clusters"] = (int)clusters.size(); + metaclusterObj["capacity"] = totalCapacity.toJson(); + metaclusterObj["allocated"] = totalAllocated.toJson(); + + obj["metacluster"] = metaclusterObj; + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + } else { + fmt::print(" number of data clusters: {}\n", clusters.size()); + fmt::print(" tenant group capacity: {}\n", totalCapacity.numTenantGroups); + fmt::print(" allocated tenant groups: {}\n", totalAllocated.numTenantGroups); + } + + return true; + } catch (Error& e) { + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "error"; + obj["error"] = e.what(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + return false; + } else { + throw; + } + } +} + +// metacluster command +Future metaclusterCommand(Reference db, std::vector tokens) { + if (tokens.size() == 1) { + printUsage(tokens[0]); + return true; + } else if (tokencmp(tokens[1], "create_experimental")) { + return metaclusterCreateCommand(db, tokens); + } else if (tokencmp(tokens[1], "decommission")) { + return metaclusterDecommissionCommand(db, tokens); + } else if (tokencmp(tokens[1], "register")) { + return metaclusterRegisterCommand(db, tokens); + } else if (tokencmp(tokens[1], "remove")) { + return metaclusterRemoveCommand(db, tokens); + } else if (tokencmp(tokens[1], "configure")) { + return metaclusterConfigureCommand(db, 
tokens); + } else if (tokencmp(tokens[1], "list")) { + return metaclusterListCommand(db, tokens); + } else if (tokencmp(tokens[1], "get")) { + return metaclusterGetCommand(db, tokens); + } else if (tokencmp(tokens[1], "status")) { + return metaclusterStatusCommand(db, tokens); + } else { + printUsage(tokens[0]); + return true; + } +} + +void metaclusterGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { + "create_experimental", "decommission", "register", "remove", "configure", "list", "get", "status", nullptr + }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() > 1 && (tokencmp(tokens[1], "register") || tokencmp(tokens[1], "configure"))) { + const char* opts[] = { "max_tenant_groups=", "connection_string=", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if ((tokens.size() == 2 && tokencmp(tokens[1], "status")) || + (tokens.size() == 3 && tokencmp(tokens[1], "get"))) { + const char* opts[] = { "JSON", nullptr }; + arrayGenerator(text, line, opts, lc); + } +} + +std::vector metaclusterHintGenerator(std::vector const& tokens, bool inArgument) { + if (tokens.size() == 1) { + return { "", "[ARGS]" }; + } else if (tokencmp(tokens[1], "create_experimental")) { + return { "" }; + } else if (tokencmp(tokens[1], "decommission")) { + return {}; + } else if (tokencmp(tokens[1], "register") && tokens.size() < 5) { + static std::vector opts = { "", + "connection_string=", + "[max_tenant_groups=]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "remove") && tokens.size() < 4) { + static std::vector opts = { "[FORCE]", "" }; + if (tokens.size() == 2) { + return opts; + } else if (tokens.size() == 3 && (inArgument || tokens[2].size() == "FORCE"_sr.size()) && + "FORCE"_sr.startsWith(tokens[2])) { + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else { + return {}; + } + } 
else if (tokencmp(tokens[1], "configure")) { + static std::vector opts = { + "", "|connection_string=>" + }; + return std::vector(opts.begin() + std::min(1, tokens.size() - 2), opts.end()); + } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { + static std::vector opts = { "", "[JSON]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "status") && tokens.size() == 2) { + return { "[JSON]" }; + } else { + return {}; + } +} + +CommandFactory metaclusterRegisterFactory( + "metacluster", + CommandHelp("metacluster [ARGS]", + "view and manage a metacluster", + "`create_experimental' and `decommission' set up or deconfigure a metacluster.\n" + "`register' and `remove' add and remove data clusters from the metacluster.\n" + "`configure' updates the configuration of a data cluster.\n" + "`list' prints a list of data clusters in the metacluster.\n" + "`get' prints the metadata for a particular data cluster.\n" + "`status' prints metacluster metadata.\n"), + &metaclusterGenerator, + &metaclusterHintGenerator); + +} // namespace fdb_cli diff --git a/fdbcli/StatusCommand.actor.cpp b/fdbcli/StatusCommand.actor.cpp index f8749c0bce..67dcfa946d 100644 --- a/fdbcli/StatusCommand.actor.cpp +++ b/fdbcli/StatusCommand.actor.cpp @@ -411,6 +411,7 @@ void printStatus(StatusObjectReader statusObj, outputString += "\nConfiguration:"; std::string outputStringCache = outputString; bool isOldMemory = false; + bool blobGranuleEnabled{ false }; try { // Configuration section // FIXME: Should we suppress this if there are cluster messages implying that the database has no @@ -434,7 +435,6 @@ void printStatus(StatusObjectReader statusObj, outputString += "unknown"; int intVal = 0; - bool blobGranuleEnabled{ false }; if 
(statusObjConfig.get("blob_granules_enabled", intVal) && intVal) { blobGranuleEnabled = true; } @@ -1110,6 +1110,15 @@ void printStatus(StatusObjectReader statusObj, outputString += "\n\nCoordination servers:"; outputString += getCoordinatorsInfoString(statusObj); } + + if (blobGranuleEnabled) { + outputString += "\n\nBlob Granules:"; + StatusObjectReader statusObjBlobGranules = statusObjCluster["blob_granules"]; + auto numWorkers = statusObjBlobGranules["number_of_blob_workers"].get_int(); + outputString += "\n Number of Workers - " + format("%d", numWorkers); + auto numKeyRanges = statusObjBlobGranules["number_of_key_ranges"].get_int(); + outputString += "\n Number of Key Ranges - " + format("%d", numKeyRanges); + } } // client time diff --git a/fdbcli/SuspendCommand.actor.cpp b/fdbcli/SuspendCommand.actor.cpp index 78a7fa1ed9..483ad4e445 100644 --- a/fdbcli/SuspendCommand.actor.cpp +++ b/fdbcli/SuspendCommand.actor.cpp @@ -43,7 +43,7 @@ ACTOR Future suspendCommandActor(Reference db, if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); if (address_interface->size() == 0) { printf("\nNo addresses can be suspended.\n"); } else if (address_interface->size() == 1) { diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index eebc556133..7648d8dbd8 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -25,6 +25,7 @@ #include "fdbclient/IClientApi.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/MetaclusterManagement.actor.h" #include "fdbclient/TenantManagement.actor.h" #include "fdbclient/Schemas.h" @@ -100,9 +101,9 @@ Key makeConfigKey(TenantNameRef tenantName, StringRef configName) { return tenantConfigSpecialKeyRange.begin.withSuffix(Tuple().append(tenantName).append(configName).pack()); } -void 
applyConfiguration(Reference tr, - TenantNameRef tenantName, - std::map, Optional> configuration) { +void applyConfigurationToSpecialKeys(Reference tr, + TenantNameRef tenantName, + std::map, Optional> configuration) { for (auto [configName, value] : configuration) { if (value.present()) { tr->set(makeConfigKey(tenantName, configName), value.get()); @@ -136,21 +137,32 @@ ACTOR Future createTenantCommandActor(Reference db, std::vector } loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - if (!doneExistenceCheck) { - // Hold the reference to the standalone's memory - state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); - Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); - if (existingTenant.present()) { - throw tenant_already_exists(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry tenantEntry; + for (auto const& [name, value] : configuration.get()) { + tenantEntry.configure(name, value); } - doneExistenceCheck = true; + wait(MetaclusterAPI::createTenant(db, tokens[1], tenantEntry)); + } else { + if (!doneExistenceCheck) { + // Hold the reference to the standalone's memory + state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); + Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); + if (existingTenant.present()) { + throw tenant_already_exists(); + } + doneExistenceCheck = true; + } + + tr->set(tenantNameKey, ValueRef()); + applyConfigurationToSpecialKeys(tr, tokens[1], configuration.get()); + wait(safeThreadFutureToFuture(tr->commit())); } - tr->set(tenantNameKey, ValueRef()); - applyConfiguration(tr, tokens[1], configuration.get()); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error 
err(e); @@ -167,10 +179,12 @@ ACTOR Future createTenantCommandActor(Reference db, std::vector return true; } -CommandFactory createTenantFactory("createtenant", - CommandHelp("createtenant [tenant_group=]", - "creates a new tenant in the cluster", - "Creates a new tenant in the cluster with the specified name.")); +CommandFactory createTenantFactory( + "createtenant", + CommandHelp("createtenant [tenant_group=]", + "creates a new tenant in the cluster", + "Creates a new tenant in the cluster with the specified name. An optional group can be specified " + "that will require this tenant to be placed on the same cluster as other tenants in the same group.")); // deletetenant command ACTOR Future deleteTenantCommandActor(Reference db, std::vector tokens, int apiVersion) { @@ -184,20 +198,27 @@ ACTOR Future deleteTenantCommandActor(Reference db, std::vector state bool doneExistenceCheck = false; loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - if (!doneExistenceCheck) { - // Hold the reference to the standalone's memory - state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); - Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); - if (!existingTenant.present()) { - throw tenant_not_found(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(MetaclusterAPI::deleteTenant(db, tokens[1])); + } else { + if (!doneExistenceCheck) { + // Hold the reference to the standalone's memory + state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); + Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); + if (!existingTenant.present()) { + throw tenant_not_found(); + } + doneExistenceCheck = true; } - doneExistenceCheck = true; + + tr->clear(tenantNameKey); + 
wait(safeThreadFutureToFuture(tr->commit())); } - tr->clear(tenantNameKey); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error err(e); @@ -228,8 +249,8 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< return false; } - StringRef beginTenant = ""_sr; - StringRef endTenant = "\xff\xff"_sr; + state StringRef beginTenant = ""_sr; + state StringRef endTenant = "\xff\xff"_sr; state int limit = 100; if (tokens.size() >= 2) { @@ -256,12 +277,26 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< loop { try { - // Hold the reference to the standalone's memory - state ThreadFuture kvsFuture = - tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit); - RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture)); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::vector tenantNames; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + std::vector> tenants = + wait(MetaclusterAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit)); + for (auto tenant : tenants) { + tenantNames.push_back(tenant.first); + } + } else { + // Hold the reference to the standalone's memory + state ThreadFuture kvsFuture = + tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit); + RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture)); + for (auto tenant : tenants) { + tenantNames.push_back(tenant.key.removePrefix(tenantMapSpecialKeyRange(apiVersion).begin)); + } + } - if (tenants.empty()) { + if (tenantNames.empty()) { if (tokens.size() == 1) { fmt::print("The cluster has no tenants\n"); } else { @@ -270,10 +305,8 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< } int index = 0; - for (auto tenant : tenants) { - fmt::print(" {}. 
{}\n", - ++index, - printable(tenant.key.removePrefix(tenantMapSpecialKeyRange(apiVersion).begin)).c_str()); + for (auto tenantName : tenantNames) { + fmt::print(" {}. {}\n", ++index, printable(tenantName).c_str()); } return true; @@ -309,15 +342,24 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector> tenantFuture = tr->get(tenantNameKey); - Optional tenant = wait(safeThreadFutureToFuture(tenantFuture)); - if (!tenant.present()) { - throw tenant_not_found(); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::string tenantJson; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry entry = wait(MetaclusterAPI::getTenantTransaction(tr, tokens[1])); + tenantJson = entry.toJson(apiVersion); + } else { + // Hold the reference to the standalone's memory + state ThreadFuture> tenantFuture = tr->get(tenantNameKey); + Optional tenant = wait(safeThreadFutureToFuture(tenantFuture)); + if (!tenant.present()) { + throw tenant_not_found(); + } + tenantJson = tenant.get().toString(); } json_spirit::mValue jsonObject; - json_spirit::read_string(tenant.get().toString(), jsonObject); + json_spirit::read_string(tenantJson, jsonObject); if (useJson) { json_spirit::mObject resultObj; @@ -333,6 +375,7 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector getTenantCommandActor(Reference db, std::vector getTenantCommandActor(Reference db, std::vector configureTenantCommandActor(Reference db, std::vec state Reference tr = db->createTransaction(); loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - applyConfiguration(tr, tokens[1], configuration.get()); - wait(safeThreadFutureToFuture(tr->commit())); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == 
ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry tenantEntry; + wait(MetaclusterAPI::configureTenant(db, tokens[1], configuration.get())); + } else { + applyConfigurationToSpecialKeys(tr, tokens[1], configuration.get()); + wait(safeThreadFutureToFuture(tr->commit())); + } break; } catch (Error& e) { state Error err(e); @@ -456,50 +509,56 @@ ACTOR Future renameTenantCommandActor(Reference db, std::vector state Key tenantOldNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]); state Key tenantNewNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[2]); state bool firstTry = true; - state int64_t id; + state int64_t id = -1; loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - // Hold the reference to the standalone's memory - state ThreadFuture> oldEntryFuture = tr->get(tenantOldNameKey); - state ThreadFuture> newEntryFuture = tr->get(tenantNewNameKey); - state Optional oldEntry = wait(safeThreadFutureToFuture(oldEntryFuture)); - state Optional newEntry = wait(safeThreadFutureToFuture(newEntryFuture)); - if (firstTry) { - if (!oldEntry.present()) { - throw tenant_not_found(); - } - if (newEntry.present()) { - throw tenant_already_exists(); - } - // Store the id we see when first reading this key - id = getTenantId(oldEntry.get()); - - firstTry = false; + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(MetaclusterAPI::renameTenant(db, tokens[1], tokens[2])); } else { - // If we got commit_unknown_result, the rename may have already occurred. 
- if (newEntry.present()) { - int64_t checkId = getTenantId(newEntry.get()); - if (id == checkId) { - ASSERT(!oldEntry.present() || getTenantId(oldEntry.get()) != id); - return true; + // Hold the reference to the standalone's memory + state ThreadFuture> oldEntryFuture = tr->get(tenantOldNameKey); + state ThreadFuture> newEntryFuture = tr->get(tenantNewNameKey); + state Optional oldEntry = wait(safeThreadFutureToFuture(oldEntryFuture)); + state Optional newEntry = wait(safeThreadFutureToFuture(newEntryFuture)); + if (firstTry) { + if (!oldEntry.present()) { + throw tenant_not_found(); + } + if (newEntry.present()) { + throw tenant_already_exists(); + } + // Store the id we see when first reading this key + id = getTenantId(oldEntry.get()); + + firstTry = false; + } else { + // If we got commit_unknown_result, the rename may have already occurred. + if (newEntry.present()) { + int64_t checkId = getTenantId(newEntry.get()); + if (id == checkId) { + ASSERT(!oldEntry.present() || getTenantId(oldEntry.get()) != id); + return true; + } + // If the new entry is present but does not match, then + // the rename should fail, so we throw an error. + throw tenant_already_exists(); + } + if (!oldEntry.present()) { + throw tenant_not_found(); + } + int64_t checkId = getTenantId(oldEntry.get()); + // If the id has changed since we made our first attempt, + // then it's possible we've already moved the tenant. Don't move it again. + if (id != checkId) { + throw tenant_not_found(); } - // If the new entry is present but does not match, then - // the rename should fail, so we throw an error. - throw tenant_already_exists(); - } - if (!oldEntry.present()) { - throw tenant_not_found(); - } - int64_t checkId = getTenantId(oldEntry.get()); - // If the id has changed since we made our first attempt, - // then it's possible we've already moved the tenant. Don't move it again. 
- if (id != checkId) { - throw tenant_not_found(); } + tr->set(tenantRenameKey, tokens[2]); + wait(safeThreadFutureToFuture(tr->commit())); } - tr->set(tenantRenameKey, tokens[2]); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error err(e); diff --git a/fdbcli/Util.actor.cpp b/fdbcli/Util.actor.cpp index d40a5dcaeb..2d0e77d9fe 100644 --- a/fdbcli/Util.actor.cpp +++ b/fdbcli/Util.actor.cpp @@ -62,56 +62,52 @@ ACTOR Future getSpecialKeysFailureErrorMessage(Reference verifyAndAddInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv) { - wait(connectLock->take()); - state FlowLock::Releaser releaser(*connectLock); - state ClientWorkerInterface workerInterf; - try { - // the interface is back-ward compatible, thus if parsing failed, it needs to upgrade cli version - workerInterf = BinaryReader::fromStringRef(kv.value, IncludeVersion()); - } catch (Error& e) { - fprintf(stderr, "Error: %s; CLI version is too old, please update to use a newer version\n", e.what()); - return Void(); - } - state ClientLeaderRegInterface leaderInterf(workerInterf.address()); - choose { - when(Optional rep = - wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { - StringRef ip_port = - (kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); - (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); - - if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { - Key full_ip_port2 = - StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) - ? 
full_ip_port2.removeSuffix(LiteralStringRef(":tls")) - : full_ip_port2; - (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); - } +void addInterfacesFromKVs(RangeResult& kvs, + std::map>* address_interface) { + for (const auto& kv : kvs) { + ClientWorkerInterface workerInterf; + try { + // the interface is back-ward compatible, thus if parsing failed, it needs to upgrade cli version + workerInterf = BinaryReader::fromStringRef(kv.value, IncludeVersion()); + } catch (Error& e) { + fprintf(stderr, "Error: %s; CLI version is too old, please update to use a newer version\n", e.what()); + return; + } + ClientLeaderRegInterface leaderInterf(workerInterf.address()); + StringRef ip_port = + (kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) + .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); + (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); + + if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { + Key full_ip_port2 = + StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); + StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) + ? 
full_ip_port2.removeSuffix(LiteralStringRef(":tls")) + : full_ip_port2; + (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); } - when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {} } - return Void(); } ACTOR Future getWorkerInterfaces(Reference tr, - std::map>* address_interface) { + std::map>* address_interface, + bool verify) { + if (verify) { + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->set(workerInterfacesVerifyOptionSpecialKey, ValueRef()); + } // Hold the reference to the standalone's memory state ThreadFuture kvsFuture = tr->getRange( KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")), CLIENT_KNOBS->TOO_MANY); - RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); + state RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); ASSERT(!kvs.more); - auto connectLock = makeReference(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM); - std::vector> addInterfs; - for (auto it : kvs) { - addInterfs.push_back(verifyAndAddInterface(address_interface, connectLock, it)); + if (verify) { + // remove the option if set + tr->clear(workerInterfacesVerifyOptionSpecialKey); } - wait(waitForAll(addInterfs)); + addInterfacesFromKVs(kvs, address_interface); return Void(); } diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 072b11fec0..b10ed32a20 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -103,6 +103,7 @@ enum { OPT_DEBUG_TLS, OPT_API_VERSION, OPT_MEMORY, + OPT_USE_FUTURE_PROTOCOL_VERSION }; CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, @@ -127,6 +128,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, { OPT_DEBUG_TLS, "--debug-tls", SO_NONE }, { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, { OPT_MEMORY, "--memory", SO_REQ_SEP }, + { OPT_USE_FUTURE_PROTOCOL_VERSION, "--use-future-protocol-version", SO_NONE }, TLS_OPTION_FLAGS, SO_END_OF_OPTIONS }; @@ -475,6 
+477,9 @@ static void printProgramUsage(const char* name) { " Useful in reporting and diagnosing TLS issues.\n" " --build-flags Print build information and exit.\n" " --memory Resident memory limit of the CLI (defaults to 8GiB).\n" + " --use-future-protocol-version\n" + " Use the simulated future protocol version to connect to the cluster.\n" + " This option can be used for testing purposes only!\n" + " -v, --version Print FoundationDB CLI version information and exit.\n" " -h, --help Display this help and exit.\n"); } @@ -578,7 +583,7 @@ void initHelp() { void printVersion() { printf("FoundationDB CLI " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); printf("source version %s\n", getSourceVersion()); - printf("protocol %" PRIx64 "\n", currentProtocolVersion.version()); + printf("protocol %" PRIx64 "\n", currentProtocolVersion().version()); } void printBuildInformation() { @@ -872,6 +877,7 @@ struct CLIOptions { Optional exec; bool initialStatusCheck = true; bool cliHints = true; + bool useFutureProtocolVersion = false; bool debugTLS = false; std::string tlsCertPath; std::string tlsKeyPath; @@ -973,6 +979,10 @@ struct CLIOptions { break; case OPT_NO_HINTS: cliHints = false; + break; + case OPT_USE_FUTURE_PROTOCOL_VERSION: + useFutureProtocolVersion = true; + break; // TLS Options case TLSConfig::OPT_TLS_PLUGIN: @@ -1040,36 +1050,6 @@ Future stopNetworkAfter(Future what) { } } -ACTOR Future addInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv) { - wait(connectLock->take()); - state FlowLock::Releaser releaser(*connectLock); - state ClientWorkerInterface workerInterf = - BinaryReader::fromStringRef(kv.value, IncludeVersion()); - state ClientLeaderRegInterface leaderInterf(workerInterf.address()); - choose { - when(Optional rep = - wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { - StringRef ip_port = - (kv.key.endsWith(LiteralStringRef(":tls")) ? 
kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); - (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); - - if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { - Key full_ip_port2 = - StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) - ? full_ip_port2.removeSuffix(LiteralStringRef(":tls")) - : full_ip_port2; - (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); - } - } - when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {} - } - return Void(); -} - ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { state LineNoise& linenoise = *plinenoise; state bool intrans = false; @@ -1967,6 +1947,13 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } + if (tokencmp(tokens[0], "metacluster")) { + bool _result = wait(makeInterruptable(metaclusterCommand(db, tokens))); + if (!_result) + is_error = true; + continue; + } + fprintf(stderr, "ERROR: Unknown command `%s'. 
Try `help'?\n", formatStringRef(tokens[0]).c_str()); is_error = true; } @@ -2192,6 +2179,9 @@ int main(int argc, char** argv) { try { API->selectApiVersion(opt.apiVersion); + if (opt.useFutureProtocolVersion) { + API->useFutureProtocolVersion(); + } API->setupNetwork(); opt.setupKnobs(); if (opt.exit_code != -1) { diff --git a/fdbcli/include/fdbcli/fdbcli.actor.h b/fdbcli/include/fdbcli/fdbcli.actor.h index 2b56c216a0..3df51b4677 100644 --- a/fdbcli/include/fdbcli/fdbcli.actor.h +++ b/fdbcli/include/fdbcli/fdbcli.actor.h @@ -120,6 +120,7 @@ extern const KeyRangeRef processClassSourceSpecialKeyRange; extern const KeyRangeRef processClassTypeSpecialKeyRange; // Other special keys inline const KeyRef errorMsgSpecialKey = LiteralStringRef("\xff\xff/error_message"); +inline const KeyRef workerInterfacesVerifyOptionSpecialKey = "\xff\xff/management/options/worker_interfaces/verify"_sr; // help functions (Copied from fdbcli.actor.cpp) // get all workers' info @@ -132,13 +133,14 @@ void printUsage(StringRef command); // Pre: tr failed with special_keys_api_failure error // Read the error message special key and return the message ACTOR Future getSpecialKeysFailureErrorMessage(Reference tr); -// Using \xff\xff/worker_interfaces/ special key, get all worker interfaces +// Using \xff\xff/worker_interfaces/ special key, get all worker interfaces. +// A worker list will be returned from CC. +// If verify, we will try to establish connections to all workers returned. +// In particular, it will deserialize \xff\xff/worker_interfaces/
:= kv pairs and issue RPC +// calls, then only return interfaces(kv pairs) the client can talk to ACTOR Future getWorkerInterfaces(Reference tr, - std::map>* address_interface); -// Deserialize \xff\xff/worker_interfaces/
:= k-v pair and verify by a RPC call -ACTOR Future verifyAndAddInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv); + std::map>* address_interface, + bool verify = false); // print cluster status info void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level, @@ -200,6 +202,10 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< // lock/unlock command ACTOR Future lockCommandActor(Reference db, std::vector tokens); ACTOR Future unlockDatabaseActor(Reference db, UID uid); + +// metacluster command +Future metaclusterCommand(Reference db, std::vector tokens); + // changefeed command ACTOR Future changeFeedCommandActor(Database localDb, Optional tenantEntry, diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 4b0c217293..c318a6591d 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -288,11 +288,46 @@ Reference IBackupContainer::openContainer(const std::string& u #ifdef BUILD_AZURE_BACKUP else if (u.startsWith("azure://"_sr)) { u.eat("azure://"_sr); - auto accountName = u.eat("@"_sr).toString(); - auto endpoint = u.eat("/"_sr).toString(); - auto containerName = u.eat("/"_sr).toString(); - r = makeReference( - endpoint, accountName, containerName, encryptionKeyFileName); + auto address = u.eat("/"_sr); + if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) { + CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint"); + // ..core.windows.net/ + auto endPoint = address.toString(); + auto accountName = address.eat("."_sr).toString(); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endPoint, accountName, containerName, encryptionKeyFileName); + } else { + // resolve the network address if necessary + std::string endpoint(address.toString()); + Optional parsedAddress = NetworkAddress::parseOptional(endpoint); + if (!parsedAddress.present()) { + 
try { + auto hostname = Hostname::parse(endpoint); + auto resolvedAddress = hostname.resolveBlocking(); + if (resolvedAddress.present()) { + CODE_PROBE(true, "Azure backup url with hostname in the endpoint"); + parsedAddress = resolvedAddress.get(); + } + } catch (Error& e) { + TraceEvent(SevError, "InvalidAzureBackupUrl").error(e).detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + } + if (!parsedAddress.present()) { + TraceEvent(SevError, "InvalidAzureBackupUrl").detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + auto accountName = u.eat("/"_sr).toString(); + // Avoid including ":tls" and "(fromHostname)" + // note: the endpoint needs to contain the account name + // so either ".blob.core.windows.net" or ":/" + endpoint = + fmt::format("{}/{}", formatIpPort(parsedAddress.get().ip, parsedAddress.get().port), accountName); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endpoint, accountName, containerName, encryptionKeyFileName); + } } #endif else { diff --git a/fdbclient/BackupContainerFileSystem.actor.cpp b/fdbclient/BackupContainerFileSystem.actor.cpp index b413bae0c8..b222153517 100644 --- a/fdbclient/BackupContainerFileSystem.actor.cpp +++ b/fdbclient/BackupContainerFileSystem.actor.cpp @@ -1523,11 +1523,46 @@ Reference BackupContainerFileSystem::openContainerFS( #ifdef BUILD_AZURE_BACKUP else if (u.startsWith("azure://"_sr)) { u.eat("azure://"_sr); - auto accountName = u.eat("@"_sr).toString(); - auto endpoint = u.eat("/"_sr).toString(); - auto containerName = u.eat("/"_sr).toString(); - r = makeReference( - endpoint, accountName, containerName, encryptionKeyFileName); + auto address = u.eat("/"_sr); + if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) { + CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint"); + // ..core.windows.net/ + auto endPoint = address.toString(); + auto accountName = address.eat("."_sr).toString(); + auto 
containerName = u.eat("/"_sr).toString(); + r = makeReference( + endPoint, accountName, containerName, encryptionKeyFileName); + } else { + // resolve the network address if necessary + std::string endpoint(address.toString()); + Optional parsedAddress = NetworkAddress::parseOptional(endpoint); + if (!parsedAddress.present()) { + try { + auto hostname = Hostname::parse(endpoint); + auto resolvedAddress = hostname.resolveBlocking(); + if (resolvedAddress.present()) { + CODE_PROBE(true, "Azure backup url with hostname in the endpoint"); + parsedAddress = resolvedAddress.get(); + } + } catch (Error& e) { + TraceEvent(SevError, "InvalidAzureBackupUrl").error(e).detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + } + if (!parsedAddress.present()) { + TraceEvent(SevError, "InvalidAzureBackupUrl").detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + auto accountName = u.eat("/"_sr).toString(); + // Avoid including ":tls" and "(fromHostname)" + // note: the endpoint needs to contain the account name + // so either ".blob.core.windows.net" or ":/" + endpoint = + fmt::format("{}/{}", formatIpPort(parsedAddress.get().ip, parsedAddress.get().port), accountName); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endpoint, accountName, containerName, encryptionKeyFileName); + } } #endif else { diff --git a/fdbclient/BlobGranuleCommon.cpp b/fdbclient/BlobGranuleCommon.cpp new file mode 100644 index 0000000000..44f32bcb25 --- /dev/null +++ b/fdbclient/BlobGranuleCommon.cpp @@ -0,0 +1,45 @@ +/* + * BlobGranuleCommon.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/BlobGranuleCommon.h" + +BlobGranuleSummaryRef summarizeGranuleChunk(Arena& ar, const BlobGranuleChunkRef& chunk) { + BlobGranuleSummaryRef summary; + ASSERT(chunk.snapshotFile.present()); + ASSERT(chunk.snapshotVersion != invalidVersion); + ASSERT(chunk.includedVersion >= chunk.snapshotVersion); + ASSERT(chunk.newDeltas.empty()); + + if (chunk.tenantPrefix.present()) { + summary.keyRange = KeyRangeRef(ar, chunk.keyRange.removePrefix(chunk.tenantPrefix.get())); + } else { + summary.keyRange = KeyRangeRef(ar, chunk.keyRange); + } + + summary.snapshotVersion = chunk.snapshotVersion; + summary.snapshotSize = chunk.snapshotFile.get().length; + summary.deltaVersion = chunk.includedVersion; + summary.deltaSize = 0; + for (auto& it : chunk.deltaFiles) { + summary.deltaSize += it.length; + } + + return summary; +} \ No newline at end of file diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index d18c745ce4..0e402cedb1 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -40,6 +40,7 @@ #include #include // for perf microbenchmark +#include #include #define BG_READ_DEBUG false @@ -209,16 +210,21 @@ namespace { BlobGranuleFileEncryptionKeys getEncryptBlobCipherKey(const BlobGranuleCipherKeysCtx cipherKeysCtx) { BlobGranuleFileEncryptionKeys eKeys; + // Cipher key reconstructed is 'never' inserted into BlobCipherKey cache, choose 'neverExpire' eKeys.textCipherKey = makeReference(cipherKeysCtx.textCipherKey.encryptDomainId, cipherKeysCtx.textCipherKey.baseCipherId, 
cipherKeysCtx.textCipherKey.baseCipher.begin(), cipherKeysCtx.textCipherKey.baseCipher.size(), - cipherKeysCtx.textCipherKey.salt); + cipherKeysCtx.textCipherKey.salt, + std::numeric_limits::max(), + std::numeric_limits::max()); eKeys.headerCipherKey = makeReference(cipherKeysCtx.headerCipherKey.encryptDomainId, cipherKeysCtx.headerCipherKey.baseCipherId, cipherKeysCtx.headerCipherKey.baseCipher.begin(), cipherKeysCtx.headerCipherKey.baseCipher.size(), - cipherKeysCtx.headerCipherKey.salt); + cipherKeysCtx.headerCipherKey.salt, + std::numeric_limits::max(), + std::numeric_limits::max()); return eKeys; } @@ -346,7 +352,9 @@ struct IndexBlockRef { decrypt(cipherKeysCtx.get(), *this, arena); } else { - TraceEvent("IndexBlockSize").detail("Sz", buffer.size()); + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent("IndexBlockSize").detail("Sz", buffer.size()); + } ObjectReader dataReader(buffer.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, block, arena); @@ -368,7 +376,11 @@ struct IndexBlockRef { arena, ObjectWriter::toValue(block, IncludeVersion(ProtocolVersion::withBlobGranuleFile())).contents()); } - TraceEvent(SevDebug, "IndexBlockSize").detail("Sz", buffer.size()).detail("Encrypted", cipherKeysCtx.present()); + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "IndexBlockSize") + .detail("Sz", buffer.size()) + .detail("Encrypted", cipherKeysCtx.present()); + } } template @@ -804,10 +816,6 @@ static Standalone> loadSnapshotFile( ASSERT(file.indexBlockRef.block.children.size() >= 2); - // TODO: refactor this out of delta tree - // int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first, - // index.dataBlockOffsets.back().first); - // find range of blocks needed to read ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); @@ -1157,10 +1165,6 @@ Standalone> loadChunkedDeltaFile(const Standal ASSERT(file.indexBlockRef.block.children.size() >= 2); - // TODO: refactor this out of delta tree - // 
int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first, - // index.dataBlockOffsets.back().first); - // find range of blocks needed to read ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); @@ -1169,7 +1173,8 @@ Standalone> loadChunkedDeltaFile(const Standal return deltas; } - // TODO: could cpu optimize first block a bit more by seeking right to start + // FIXME: shared prefix for key comparison + // FIXME: could cpu optimize first block a bit more by seeking right to start bool lastBlock = false; bool prevClearAfter = false; while (!lastBlock) { @@ -1553,12 +1558,23 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, return mergeDeltaStreams(chunk, streams, startClears); } +struct GranuleLoadFreeHandle : NonCopyable, ReferenceCounted { + const ReadBlobGranuleContext* granuleContext; + int64_t loadId; + + GranuleLoadFreeHandle(const ReadBlobGranuleContext* granuleContext, int64_t loadId) + : granuleContext(granuleContext), loadId(loadId) {} + + ~GranuleLoadFreeHandle() { granuleContext->free_load_f(loadId, granuleContext->userContext); } +}; + struct GranuleLoadIds { Optional snapshotId; std::vector deltaIds; + std::vector> freeHandles; }; -static void startLoad(const ReadBlobGranuleContext granuleContext, +static void startLoad(const ReadBlobGranuleContext* granuleContext, const BlobGranuleChunkRef& chunk, GranuleLoadIds& loadIds) { @@ -1568,12 +1584,13 @@ static void startLoad(const ReadBlobGranuleContext granuleContext, // FIXME: remove when we implement file multiplexing ASSERT(chunk.snapshotFile.get().offset == 0); ASSERT(chunk.snapshotFile.get().length == chunk.snapshotFile.get().fullFileLength); - loadIds.snapshotId = granuleContext.start_load_f(snapshotFname.c_str(), - snapshotFname.size(), - chunk.snapshotFile.get().offset, - chunk.snapshotFile.get().length, - chunk.snapshotFile.get().fullFileLength, - granuleContext.userContext); + loadIds.snapshotId = 
granuleContext->start_load_f(snapshotFname.c_str(), + snapshotFname.size(), + chunk.snapshotFile.get().offset, + chunk.snapshotFile.get().length, + chunk.snapshotFile.get().fullFileLength, + granuleContext->userContext); + loadIds.freeHandles.push_back(makeReference(granuleContext, loadIds.snapshotId.get())); } loadIds.deltaIds.reserve(chunk.deltaFiles.size()); for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) { @@ -1581,13 +1598,14 @@ static void startLoad(const ReadBlobGranuleContext granuleContext, // FIXME: remove when we implement file multiplexing ASSERT(chunk.deltaFiles[deltaFileIdx].offset == 0); ASSERT(chunk.deltaFiles[deltaFileIdx].length == chunk.deltaFiles[deltaFileIdx].fullFileLength); - int64_t deltaLoadId = granuleContext.start_load_f(deltaFName.c_str(), - deltaFName.size(), - chunk.deltaFiles[deltaFileIdx].offset, - chunk.deltaFiles[deltaFileIdx].length, - chunk.deltaFiles[deltaFileIdx].fullFileLength, - granuleContext.userContext); + int64_t deltaLoadId = granuleContext->start_load_f(deltaFName.c_str(), + deltaFName.size(), + chunk.deltaFiles[deltaFileIdx].offset, + chunk.deltaFiles[deltaFileIdx].length, + chunk.deltaFiles[deltaFileIdx].fullFileLength, + granuleContext->userContext); loadIds.deltaIds.push_back(deltaLoadId); + loadIds.freeHandles.push_back(makeReference(granuleContext, deltaLoadId)); } } @@ -1606,17 +1624,16 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone 1 - for (int i = 0; i < parallelism - 1 && i < files.size(); i++) { - startLoad(granuleContext, files[i], loadIds[i]); - } - try { + // Kick off first file reads if parallelism > 1 + for (int i = 0; i < parallelism - 1 && i < files.size(); i++) { + startLoad(&granuleContext, files[i], loadIds[i]); + } RangeResult results; for (int chunkIdx = 0; chunkIdx < files.size(); chunkIdx++) { // Kick off files for this granule if parallelism == 1, or future granule if parallelism > 1 if (chunkIdx + parallelism - 1 < files.size()) { - 
startLoad(granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); + startLoad(&granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); } RangeResult chunkRows; @@ -1632,7 +1649,8 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone loadAndMaterializeBlobGranules(const Standalone(results); } catch (Error& e) { @@ -2372,7 +2386,6 @@ void checkDeltaRead(const KeyValueGen& kvGen, std::string filename = randomBGFilename( deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta"); Standalone chunk; - // TODO need to add cipher keys meta chunk.deltaFiles.emplace_back_deep( chunk.arena(), filename, 0, serialized->size(), serialized->size(), kvGen.cipherKeys); chunk.keyRange = kvGen.allRange; @@ -2429,7 +2442,6 @@ static std::tuple randomizeKeyAndVersions(const KeyV } } - // TODO randomize begin and read version to sometimes +/- 1 and readRange begin and end to keyAfter sometimes return { readRange, beginVersion, readVersion }; } @@ -2653,7 +2665,11 @@ TEST_CASE("/blobgranule/files/granuleReadUnitTest") { serializedDeltaFiles, inMemoryDeltas); - for (int i = 0; i < std::min(100, 5 + snapshotData.size() * deltaData.size()); i++) { + // prevent overflow by doing min before multiply + int maxRuns = 100; + int snapshotAndDeltaSize = 5 + std::min(maxRuns, snapshotData.size()) * std::min(maxRuns, deltaData.size()); + int lim = std::min(maxRuns, snapshotAndDeltaSize); + for (int i = 0; i < lim; i++) { auto params = randomizeKeyAndVersions(kvGen, deltaData); fmt::print("Partial test {0}: [{1} - {2}) @ {3} - {4}\n", i, diff --git a/fdbclient/BlobGranuleReader.actor.cpp b/fdbclient/BlobGranuleReader.actor.cpp index e0f627a9da..9b24380d2c 100644 --- a/fdbclient/BlobGranuleReader.actor.cpp +++ b/fdbclient/BlobGranuleReader.actor.cpp @@ -31,13 +31,6 @@ #include "fdbclient/FDBTypes.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other -// sorted thing could work. And if it used arenas it'd probably be more efficient with allocations, since everything -// else is in 1 arena and discarded at the end. - -// TODO could refactor the file reading code from here and the delta file function into another actor, -// then this part would also be testable? but meh - ACTOR Future> readFile(Reference bstoreProvider, BlobFilePointerRef f) { try { state Arena arena; @@ -140,3 +133,66 @@ ACTOR Future readBlobGranules(BlobGranuleFileRequest request, return Void(); } + +// Return true if a given range is fully covered by blob chunks +bool isRangeFullyCovered(KeyRange range, Standalone> blobChunks) { + std::vector blobRanges; + for (const BlobGranuleChunkRef& chunk : blobChunks) { + blobRanges.push_back(chunk.keyRange); + } + + return range.isCovered(blobRanges); +} + +void testAddChunkRange(KeyRef begin, KeyRef end, Standalone>& chunks) { + BlobGranuleChunkRef chunk; + chunk.keyRange = KeyRangeRef(begin, end); + chunks.push_back(chunks.arena(), chunk); +} + +TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") { + Standalone> chunks; + // chunk1 key_a1 - key_a9 + testAddChunkRange("key_a1"_sr, "key_a9"_sr, chunks); + // chunk2 key_b1 - key_b9 + testAddChunkRange("key_b1"_sr, "key_b9"_sr, chunks); + + // check empty range. not covered + { ASSERT(isRangeFullyCovered(KeyRangeRef(), chunks) == false); } + + // check empty chunks. 
not covered + { + Standalone> empyChunks; + ASSERT(isRangeFullyCovered(KeyRangeRef(), empyChunks) == false); + } + + // check '' to \xff + { ASSERT(isRangeFullyCovered(KeyRangeRef(LiteralStringRef(""), LiteralStringRef("\xff")), chunks) == false); } + + // check {key_a1, key_a9} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_a9"_sr), chunks)); } + + // check {key_a1, key_a3} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_a3"_sr), chunks)); } + + // check {key_a0, key_a3} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a0"_sr, "key_a3"_sr), chunks) == false); } + + // check {key_a5, key_b2} + { + auto range = KeyRangeRef("key_a5"_sr, "key_b5"_sr); + ASSERT(isRangeFullyCovered(range, chunks) == false); + ASSERT(range.begin == "key_a5"_sr); + ASSERT(range.end == "key_b5"_sr); + } + + // check continued chunks + { + Standalone> continuedChunks; + testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks); + testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks); + testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks); + ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false); + } + return Void(); +} diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 2a1713878f..2953a360e7 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -90,8 +90,8 @@ add_flow_target(LINK_TEST NAME fdbclientlinktest SRCS LinkTest.cpp) target_link_libraries(fdbclientlinktest PRIVATE fdbclient rapidxml) # re-link rapidxml due to private link interface if(BUILD_AZURE_BACKUP) - target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite) - target_link_libraries(fdbclient_sampling PRIVATE curl uuid azure-storage-lite) + target_link_libraries(fdbclient PRIVATE curl azure-storage-lite) + target_link_libraries(fdbclient_sampling PRIVATE curl azure-storage-lite) endif() if(BUILD_AWS_BACKUP) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 
537f91f0aa..7c8a69a337 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -42,10 +42,6 @@ void ClientKnobs::initialize(Randomize randomize) { init( FAILURE_MAX_DELAY, 5.0 ); init( FAILURE_MIN_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_MIN_DELAY = 1.0; - init( FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY ); - init( CLIENT_FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY ); - init( FAILURE_EMERGENCY_DELAY, 30.0 ); - init( FAILURE_MAX_GENERATIONS, 10 ); init( RECOVERY_DELAY_START_GENERATION, 70 ); init( RECOVERY_DELAY_SECONDS_PER_GENERATION, 60.0 ); init( MAX_GENERATIONS, 100 ); @@ -64,6 +60,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test) init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; + init( GRV_ERROR_RETRY_DELAY, 5.0 ); if( randomize && BUGGIFY ) GRV_ERROR_RETRY_DELAY = 0.01 + 5 * deterministicRandom()->random01(); init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01(); init( REPLY_BYTE_LIMIT, 80000 ); init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01(); @@ -84,6 +81,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( CHANGE_FEED_CACHE_SIZE, 100000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_SIZE = 1; init( CHANGE_FEED_POP_TIMEOUT, 10.0 ); init( CHANGE_FEED_STREAM_MIN_BYTES, 1e4 ); if( randomize && BUGGIFY ) CHANGE_FEED_STREAM_MIN_BYTES = 1; + init( CHANGE_FEED_START_INTERVAL, 10.0 ); init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; init( 
GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; @@ -159,8 +157,6 @@ void ClientKnobs::initialize(Randomize randomize) { init( BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL, 60); init( BACKUP_AGGREGATE_POLL_RATE, 2.0 ); // polls per second target for all agents on the cluster init( BACKUP_LOG_WRITE_BATCH_MAX_SIZE, 1e6 ); //Must be much smaller than TRANSACTION_SIZE_LIMIT - init( BACKUP_LOG_ATOMIC_OPS_SIZE, 1000 ); - init( BACKUP_OPERATION_COST_OVERHEAD, 50 ); init( BACKUP_MAX_LOG_RANGES, 21 ); if( randomize && BUGGIFY ) BACKUP_MAX_LOG_RANGES = 4; init( BACKUP_SIM_COPY_LOG_RANGES, 100 ); init( BACKUP_VERSION_DELAY, 5*CORE_VERSIONSPERSECOND ); @@ -279,18 +275,21 @@ void ClientKnobs::initialize(Randomize randomize) { init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 ); init( BUSYNESS_SPIKE_SATURATED_THRESHOLD, 0.500 ); - // multi-version client control - init( MVC_CLIENTLIB_CHUNK_SIZE, 8*1024 ); - init( MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 32 ); - // Blob granules init( BG_MAX_GRANULE_PARALLELISM, 10 ); + init( BG_TOO_MANY_GRANULES, 10000 ); init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 ); init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 ); // Tenants and Metacluster - init( MAX_TENANTS_PER_CLUSTER, 1e6 ); if ( randomize && BUGGIFY ) MAX_TENANTS_PER_CLUSTER = deterministicRandom()->randomInt(20, 100); + init( MAX_TENANTS_PER_CLUSTER, 1e6 ); + init( TENANT_TOMBSTONE_CLEANUP_INTERVAL, 60 ); if ( randomize && BUGGIFY ) TENANT_TOMBSTONE_CLEANUP_INTERVAL = deterministicRandom()->random01() * 30; + init( MAX_DATA_CLUSTERS, 1e5 ); + init( REMOVE_CLUSTER_TENANT_BATCH_SIZE, 1e4 ); if ( randomize && BUGGIFY ) REMOVE_CLUSTER_TENANT_BATCH_SIZE = 1; + init( METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, 5 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK = 1; + init( METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY, 1.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY = deterministicRandom()->random01() * 60; + init( 
METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT = 1 + deterministicRandom()->random01() * 59; // clang-format on } diff --git a/fdbclient/KeyRangeMap.actor.cpp b/fdbclient/KeyRangeMap.actor.cpp index c736c714bf..cb1f0558c1 100644 --- a/fdbclient/KeyRangeMap.actor.cpp +++ b/fdbclient/KeyRangeMap.actor.cpp @@ -23,6 +23,7 @@ #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/ReadYourWrites.h" +#include "flow/UnitTest.h" #include "flow/actorcompiler.h" // has to be last include void KeyRangeActorMap::getRangesAffectedByInsertion(const KeyRangeRef& keys, std::vector& affectedRanges) { @@ -35,32 +36,54 @@ void KeyRangeActorMap::getRangesAffectedByInsertion(const KeyRangeRef& keys, std affectedRanges.push_back(KeyRangeRef(keys.end, e.end())); } -RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv) { +RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv, bool align) { ASSERT(!kv.more || kv.size() > 1); KeyRange withPrefix = KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); - ValueRef beginValue, endValue; - if (kv.size() && kv[0].key.startsWith(mapPrefix)) - beginValue = kv[0].value; - if (kv.size() && kv.end()[-1].key.startsWith(mapPrefix)) - endValue = kv.end()[-1].value; - RangeResult result; result.arena().dependsOn(kv.arena()); result.arena().dependsOn(keys.arena()); - result.push_back(result.arena(), KeyValueRef(keys.begin, beginValue)); + // Always push a kv pair <= keys.begin. 
+ KeyRef beginKey = keys.begin; + if (!align && !kv.empty() && kv.front().key.startsWith(mapPrefix) && kv.front().key < withPrefix.begin) { + beginKey = kv[0].key.removePrefix(mapPrefix); + } + ValueRef beginValue; + if (!kv.empty() && kv.front().key.startsWith(mapPrefix) && kv.front().key <= withPrefix.begin) { + beginValue = kv.front().value; + } + result.push_back(result.arena(), KeyValueRef(beginKey, beginValue)); + for (int i = 0; i < kv.size(); i++) { if (kv[i].key > withPrefix.begin && kv[i].key < withPrefix.end) { KeyRef k = kv[i].key.removePrefix(mapPrefix); result.push_back(result.arena(), KeyValueRef(k, kv[i].value)); - } else if (kv[i].key >= withPrefix.end) + } else if (kv[i].key >= withPrefix.end) { kv.more = false; + // There should be at most 1 value past mapPrefix + keys.end. + ASSERT(i == kv.size() - 1); + break; + } } - if (!kv.more) - result.push_back(result.arena(), KeyValueRef(keys.end, endValue)); + if (!kv.more) { + KeyRef endKey = keys.end; + if (!align && !kv.empty() && kv.back().key.startsWith(mapPrefix) && kv.back().key >= withPrefix.end) { + endKey = kv.back().key.removePrefix(mapPrefix); + } + ValueRef endValue; + if (!kv.empty()) { + // In the aligned case, carry the last value to be the end value. 
+ if (align && kv.back().key.startsWith(mapPrefix) && kv.back().key > withPrefix.end) { + endValue = result.back().value; + } else { + endValue = kv.back().value; + } + } + result.push_back(result.arena(), KeyValueRef(endKey, endValue)); + } result.more = kv.more; return result; @@ -93,6 +116,37 @@ ACTOR Future krmGetRanges(Reference tr, return krmDecodeRanges(mapPrefix, keys, kv); } +// Returns keys.begin, all transitional points in keys, and keys.end, and their values +ACTOR Future krmGetRangesUnaligned(Transaction* tr, + Key mapPrefix, + KeyRange keys, + int limit, + int limitBytes) { + KeyRange withPrefix = + KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); + + state GetRangeLimits limits(limit, limitBytes); + limits.minRows = 2; + RangeResult kv = wait(tr->getRange(lastLessOrEqual(withPrefix.begin), firstGreaterThan(withPrefix.end), limits)); + + return krmDecodeRanges(mapPrefix, keys, kv, false); +} + +ACTOR Future krmGetRangesUnaligned(Reference tr, + Key mapPrefix, + KeyRange keys, + int limit, + int limitBytes) { + KeyRange withPrefix = + KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); + + state GetRangeLimits limits(limit, limitBytes); + limits.minRows = 2; + RangeResult kv = wait(tr->getRange(lastLessOrEqual(withPrefix.begin), firstGreaterThan(withPrefix.end), limits)); + + return krmDecodeRanges(mapPrefix, keys, kv, false); +} + void krmSetPreviouslyEmptyRange(Transaction* tr, const KeyRef& mapPrefix, const KeyRangeRef& keys, @@ -254,3 +308,87 @@ Future krmSetRangeCoalescing(Reference const& t Value const& value) { return holdWhile(tr, krmSetRangeCoalescing_(tr.getPtr(), mapPrefix, range, maxRange, value)); } + +TEST_CASE("/keyrangemap/decoderange/aligned") { + Arena arena; + Key prefix = LiteralStringRef("/prefix/"); + StringRef fullKeyA = StringRef(arena, LiteralStringRef("/prefix/a")); + StringRef fullKeyB = StringRef(arena, 
LiteralStringRef("/prefix/b")); + StringRef fullKeyC = StringRef(arena, LiteralStringRef("/prefix/c")); + StringRef fullKeyD = StringRef(arena, LiteralStringRef("/prefix/d")); + + StringRef keyA = StringRef(arena, LiteralStringRef("a")); + StringRef keyB = StringRef(arena, LiteralStringRef("b")); + StringRef keyC = StringRef(arena, LiteralStringRef("c")); + StringRef keyD = StringRef(arena, LiteralStringRef("d")); + StringRef keyE = StringRef(arena, LiteralStringRef("e")); + StringRef keyAB = StringRef(arena, LiteralStringRef("ab")); + StringRef keyCD = StringRef(arena, LiteralStringRef("cd")); + + // Fake getRange() call. + RangeResult kv; + kv.push_back(arena, KeyValueRef(fullKeyA, keyA)); + kv.push_back(arena, KeyValueRef(fullKeyB, keyB)); + kv.push_back(arena, KeyValueRef(fullKeyC, keyC)); + kv.push_back(arena, KeyValueRef(fullKeyD, keyD)); + + // [A, AB(start), B, C, CD(end), D] + RangeResult decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyCD), kv); + ASSERT(decodedRanges.size() == 4); + ASSERT(decodedRanges.front().key == keyAB); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyCD); + ASSERT(decodedRanges.back().value == keyC); + + // [""(start), A, B, C, D, E(end)] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(StringRef(), keyE), kv); + ASSERT(decodedRanges.size() == 6); + ASSERT(decodedRanges.front().key == StringRef()); + ASSERT(decodedRanges.front().value == StringRef()); + ASSERT(decodedRanges.back().key == keyE); + ASSERT(decodedRanges.back().value == keyD); + + return Void(); +} + +TEST_CASE("/keyrangemap/decoderange/unaligned") { + Arena arena; + Key prefix = LiteralStringRef("/prefix/"); + StringRef fullKeyA = StringRef(arena, LiteralStringRef("/prefix/a")); + StringRef fullKeyB = StringRef(arena, LiteralStringRef("/prefix/b")); + StringRef fullKeyC = StringRef(arena, LiteralStringRef("/prefix/c")); + StringRef fullKeyD = StringRef(arena, LiteralStringRef("/prefix/d")); + + StringRef keyA = 
StringRef(arena, LiteralStringRef("a")); + StringRef keyB = StringRef(arena, LiteralStringRef("b")); + StringRef keyC = StringRef(arena, LiteralStringRef("c")); + StringRef keyD = StringRef(arena, LiteralStringRef("d")); + StringRef keyE = StringRef(arena, LiteralStringRef("e")); + StringRef keyAB = StringRef(arena, LiteralStringRef("ab")); + StringRef keyCD = StringRef(arena, LiteralStringRef("cd")); + + // Fake getRange() call. + RangeResult kv; + kv.push_back(arena, KeyValueRef(fullKeyA, keyA)); + kv.push_back(arena, KeyValueRef(fullKeyB, keyB)); + kv.push_back(arena, KeyValueRef(fullKeyC, keyC)); + kv.push_back(arena, KeyValueRef(fullKeyD, keyD)); + + // [A, AB(start), B, C, CD(end), D] + RangeResult decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyCD), kv, false); + ASSERT(decodedRanges.size() == 4); + ASSERT(decodedRanges.front().key == keyA); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyD); + ASSERT(decodedRanges.back().value == keyD); + + // [""(start), A, B, C, D, E(end)] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(StringRef(), keyE), kv, false); + ASSERT(decodedRanges.size() == 6); + ASSERT(decodedRanges.front().key == StringRef()); + ASSERT(decodedRanges.front().value == StringRef()); + ASSERT(decodedRanges.back().key == keyE); + ASSERT(decodedRanges.back().value == keyD); + + return Void(); +} \ No newline at end of file diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 2a5c9ac910..6270cc0b88 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -2559,7 +2559,7 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") { ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource), "", "", - currentProtocolVersion); + currentProtocolVersion()); } workers.push_back(data); diff --git a/fdbclient/Metacluster.cpp b/fdbclient/Metacluster.cpp new file mode 100644 index 0000000000..6463033db8 --- 
/dev/null +++ b/fdbclient/Metacluster.cpp @@ -0,0 +1,71 @@ +/* + * Metacluster.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/Metacluster.h" +#include "fdbclient/MetaclusterManagement.actor.h" + +FDB_DEFINE_BOOLEAN_PARAM(AddNewTenants); +FDB_DEFINE_BOOLEAN_PARAM(RemoveMissingTenants); + +std::string DataClusterEntry::clusterStateToString(DataClusterState clusterState) { + switch (clusterState) { + case DataClusterState::READY: + return "ready"; + case DataClusterState::REMOVING: + return "removing"; + case DataClusterState::RESTORING: + return "restoring"; + default: + UNREACHABLE(); + } +} + +DataClusterState DataClusterEntry::stringToClusterState(std::string stateStr) { + if (stateStr == "ready") { + return DataClusterState::READY; + } else if (stateStr == "removing") { + return DataClusterState::REMOVING; + } else if (stateStr == "restoring") { + return DataClusterState::RESTORING; + } + + UNREACHABLE(); +} + +json_spirit::mObject DataClusterEntry::toJson() const { + json_spirit::mObject obj; + obj["capacity"] = capacity.toJson(); + obj["allocated"] = allocated.toJson(); + obj["cluster_state"] = DataClusterEntry::clusterStateToString(clusterState); + return obj; +} + +json_spirit::mObject ClusterUsage::toJson() const { + json_spirit::mObject obj; + obj["num_tenant_groups"] = 
numTenantGroups; + return obj; +} + +KeyBackedObjectProperty& +MetaclusterMetadata::metaclusterRegistration() { + static KeyBackedObjectProperty instance( + "\xff/metacluster/clusterRegistration"_sr, IncludeVersion()); + return instance; +} \ No newline at end of file diff --git a/fdbclient/MetaclusterManagement.actor.cpp b/fdbclient/MetaclusterManagement.actor.cpp new file mode 100644 index 0000000000..33403300bd --- /dev/null +++ b/fdbclient/MetaclusterManagement.actor.cpp @@ -0,0 +1,67 @@ +/* + * MetaclusterManagement.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/ThreadSafeTransaction.h" +#include "flow/actorcompiler.h" // has to be last include + +namespace MetaclusterAPI { + +ACTOR Future> openDatabase(ClusterConnectionString connectionString) { + if (g_network->isSimulated()) { + Reference clusterFile = + makeReference(connectionString); + Database nativeDb = Database::createDatabase(clusterFile, -1); + Reference threadSafeDb = + wait(unsafeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(nativeDb))); + return MultiVersionDatabase::debugCreateFromExistingDatabase(threadSafeDb); + } else { + return MultiVersionApi::api->createDatabaseFromConnectionString(connectionString.toString().c_str()); + } +} + +KeyBackedObjectMap& +ManagementClusterMetadata::dataClusters() { + static KeyBackedObjectMap instance( + "metacluster/dataCluster/metadata/"_sr, IncludeVersion()); + return instance; +} + +KeyBackedMap, + ManagementClusterMetadata::ConnectionStringCodec> + ManagementClusterMetadata::dataClusterConnectionRecords("metacluster/dataCluster/connectionString/"_sr); + +KeyBackedSet ManagementClusterMetadata::clusterCapacityIndex("metacluster/clusterCapacityIndex/"_sr); +KeyBackedMap, BinaryCodec> + ManagementClusterMetadata::clusterTenantCount("metacluster/clusterTenantCount/"_sr); +KeyBackedSet ManagementClusterMetadata::clusterTenantIndex("metacluster/dataCluster/tenantMap/"_sr); +KeyBackedSet ManagementClusterMetadata::clusterTenantGroupIndex("metacluster/dataCluster/tenantGroupMap/"_sr); + +TenantMetadataSpecification& ManagementClusterMetadata::tenantMetadata() { + static TenantMetadataSpecification instance(""_sr); + return instance; +} + +}; // namespace MetaclusterAPI \ No newline at end of file diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 
f72913f7ee..977d465908 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -663,69 +663,43 @@ ACTOR Future asyncDeserializeClusterInterface(Reference> s } } -struct ClientStatusStats { - int count; - std::vector> examples; +namespace { - ClientStatusStats() : count(0) { examples.reserve(CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT); } -}; +void tryInsertIntoSamples(OpenDatabaseRequest::Samples& samples, + const NetworkAddress& networkAddress, + const Key& traceLogGroup) { + ++samples.count; + if (samples.samples.size() < static_cast(CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT)) { + samples.samples.insert({ networkAddress, traceLogGroup }); + } +} + +} // namespace OpenDatabaseRequest ClientData::getRequest() { OpenDatabaseRequest req; - std::map issueMap; - std::map versionMap; - std::map maxProtocolMap; - int clientCount = 0; - - // SOMEDAY: add a yield in this loop for (auto& ci : clientStatusInfoMap) { - for (auto& it : ci.second.issues) { - auto& entry = issueMap[it]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - if (ci.second.versions.size()) { - clientCount++; - StringRef maxProtocol; - for (auto& it : ci.second.versions) { - maxProtocol = std::max(maxProtocol, it.protocolVersion); - auto& entry = versionMap[it]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - auto& maxEntry = maxProtocolMap[maxProtocol]; - maxEntry.count++; - if (maxEntry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - maxEntry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } else { - auto& entry = versionMap[ClientVersionRef()]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - } + const auto& 
networkAddress = ci.first; + const auto& traceLogGroup = ci.second.traceLogGroup; - req.issues.reserve(issueMap.size()); - for (auto& it : issueMap) { - req.issues.push_back(ItemWithExamples(it.first, it.second.count, it.second.examples)); + for (auto& issue : ci.second.issues) { + tryInsertIntoSamples(req.issues[issue], networkAddress, traceLogGroup); + } + + if (!ci.second.versions.size()) { + tryInsertIntoSamples(req.supportedVersions[ClientVersionRef()], networkAddress, traceLogGroup); + continue; + } + + ++req.clientCount; + StringRef maxProtocol; + for (auto& it : ci.second.versions) { + maxProtocol = std::max(maxProtocol, it.protocolVersion); + tryInsertIntoSamples(req.supportedVersions[it], networkAddress, traceLogGroup); + } + tryInsertIntoSamples(req.maxProtocolSupported[maxProtocol], networkAddress, traceLogGroup); } - req.supportedVersions.reserve(versionMap.size()); - for (auto& it : versionMap) { - req.supportedVersions.push_back( - ItemWithExamples>(it.first, it.second.count, it.second.examples)); - } - req.maxProtocolSupported.reserve(maxProtocolMap.size()); - for (auto& it : maxProtocolMap) { - req.maxProtocolSupported.push_back(ItemWithExamples(it.first, it.second.count, it.second.examples)); - } - req.clientCount = clientCount; return req; } diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index b08457a3ac..51271dd09e 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -257,13 +257,14 @@ ThreadFuture>> DLTransaction::getRangeSplitPoints(c }); } -ThreadFuture>> DLTransaction::getBlobGranuleRanges(const KeyRangeRef& keyRange) { +ThreadFuture>> DLTransaction::getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) { if (!api->transactionGetBlobGranuleRanges) { return unsupported_operation(); } FdbCApi::FDBFuture* f = api->transactionGetBlobGranuleRanges( - tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), 
keyRange.end.size()); + tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit); return toThreadFuture>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { const FdbCApi::FDBKeyRange* keyRanges; int keyRangesLength; @@ -279,10 +280,46 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) { - if (!api->transactionReadBlobGranules) { + return unsupported_operation(); +} + +ThreadFuture>> DLTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { + if (!api->transactionReadBlobGranulesStart) { return unsupported_operation(); } + int64_t rv = readVersion.present() ? readVersion.get() : latestVersion; + + FdbCApi::FDBFuture* f = api->transactionReadBlobGranulesStart(tr, + keyRange.begin.begin(), + keyRange.begin.size(), + keyRange.end.begin(), + keyRange.end.size(), + beginVersion, + rv, + readVersionOut); + + return ThreadFuture>>( + (ThreadSingleAssignmentVar>>*)(f)); +}; + +ThreadResult DLTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { + if (!api->transactionReadBlobGranulesFinish) { + return unsupported_operation(); + } + + // convert back to fdb future for API + FdbCApi::FDBFuture* f = (FdbCApi::FDBFuture*)(startFuture.extractPtr()); + // FIXME: better way to convert here? FdbCApi::FDBReadBlobGranuleContext context; context.userContext = granuleContext.userContext; @@ -292,18 +329,18 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key context.debugNoMaterialize = granuleContext.debugNoMaterialize; context.granuleParallelism = granuleContext.granuleParallelism; - int64_t rv = readVersion.present() ? 
readVersion.get() : latestVersion; + FdbCApi::FDBResult* r = api->transactionReadBlobGranulesFinish(tr, + f, + keyRange.begin.begin(), + keyRange.begin.size(), + keyRange.end.begin(), + keyRange.end.size(), + beginVersion, + readVersion, + &context); - FdbCApi::FDBResult* r = api->transactionReadBlobGranules(tr, - keyRange.begin.begin(), - keyRange.begin.size(), - keyRange.end.begin(), - keyRange.end.size(), - beginVersion, - rv, - context); return ThreadResult((ThreadSingleAssignmentVar*)(r)); -} +}; void DLTransaction::addReadConflictRange(const KeyRangeRef& keys) { throwIfError(api->transactionAddConflictRange( @@ -583,6 +620,71 @@ ThreadFuture DLDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); }); } +ThreadFuture DLDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + if (!api->databaseBlobbifyRange) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseBlobbifyRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + bool ret = false; + ASSERT(!api->futureGetBool(f, &ret)); + return ret; + }); +} + +ThreadFuture DLDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + if (!api->databaseUnblobbifyRange) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseUnblobbifyRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + bool ret = false; + ASSERT(!api->futureGetBool(f, &ret)); + return ret; + }); +} + +ThreadFuture>> DLDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + if (!api->databaseListBlobbifiedRanges) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseListBlobbifiedRanges( + db, 
keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit); + + return toThreadFuture>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + const FdbCApi::FDBKeyRange* keyRanges; + int keyRangesLength; + FdbCApi::fdb_error_t error = api->futureGetKeyRangeArray(f, &keyRanges, &keyRangesLength); + ASSERT(!error); + // The memory for this is stored in the FDBFuture and is released when the future gets destroyed. + return Standalone>(VectorRef((KeyRangeRef*)keyRanges, keyRangesLength), + Arena()); + }); +} + +ThreadFuture DLDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + if (!api->databaseVerifyBlobRange) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseVerifyBlobRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), version); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + Version version = invalidVersion; + ASSERT(!api->futureGetInt64(f, &version)); + return version; + }); +} + // DLApi // Loads the specified function from a dynamic library @@ -626,6 +728,8 @@ void DLApi::init() { loadClientFunction(&api->selectApiVersion, lib, fdbCPath, "fdb_select_api_version_impl", headerVersion >= 0); loadClientFunction(&api->getClientVersion, lib, fdbCPath, "fdb_get_client_version", headerVersion >= 410); + loadClientFunction( + &api->useFutureProtocolVersion, lib, fdbCPath, "fdb_use_future_protocol_version", headerVersion >= 720); loadClientFunction(&api->setNetworkOption, lib, fdbCPath, "fdb_network_set_option", headerVersion >= 0); loadClientFunction(&api->setupNetwork, lib, fdbCPath, "fdb_setup_network", headerVersion >= 0); loadClientFunction(&api->runNetwork, lib, fdbCPath, "fdb_run_network", headerVersion >= 0); @@ -668,6 +772,13 @@ void DLApi::init() { fdbCPath, "fdb_database_wait_purge_granules_complete", headerVersion >= 710); + loadClientFunction(&api->databaseBlobbifyRange, lib, fdbCPath, 
"fdb_database_blobbify_range", headerVersion >= 720); + loadClientFunction( + &api->databaseUnblobbifyRange, lib, fdbCPath, "fdb_database_unblobbify_range", headerVersion >= 720); + loadClientFunction( + &api->databaseListBlobbifiedRanges, lib, fdbCPath, "fdb_database_list_blobbified_ranges", headerVersion >= 720); + loadClientFunction( + &api->databaseVerifyBlobRange, lib, fdbCPath, "fdb_database_verify_blob_range", headerVersion >= 720); loadClientFunction( &api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710); @@ -737,11 +848,22 @@ void DLApi::init() { headerVersion >= 710); loadClientFunction( &api->transactionReadBlobGranules, lib, fdbCPath, "fdb_transaction_read_blob_granules", headerVersion >= 710); + loadClientFunction(&api->transactionReadBlobGranulesStart, + lib, + fdbCPath, + "fdb_transaction_read_blob_granules_start", + headerVersion >= 720); + loadClientFunction(&api->transactionReadBlobGranulesFinish, + lib, + fdbCPath, + "fdb_transaction_read_blob_granules_finish", + headerVersion >= 720); loadClientFunction(&api->futureGetInt64, lib, fdbCPath, headerVersion >= 620 ? 
"fdb_future_get_int64" : "fdb_future_get_version", headerVersion >= 0); + loadClientFunction(&api->futureGetBool, lib, fdbCPath, "fdb_future_get_bool", headerVersion >= 720); loadClientFunction(&api->futureGetUInt64, lib, fdbCPath, "fdb_future_get_uint64", headerVersion >= 700); loadClientFunction(&api->futureGetError, lib, fdbCPath, "fdb_future_get_error", headerVersion >= 0); loadClientFunction(&api->futureGetKey, lib, fdbCPath, "fdb_future_get_key", headerVersion >= 0); @@ -788,6 +910,14 @@ const char* DLApi::getClientVersion() { return api->getClientVersion(); } +void DLApi::useFutureProtocolVersion() { + if (!api->useFutureProtocolVersion) { + return; + } + + api->useFutureProtocolVersion(); +} + void DLApi::setNetworkOption(FDBNetworkOptions::Option option, Optional value) { throwIfError(api->setNetworkOption(static_cast(option), value.present() ? value.get().begin() : nullptr, @@ -1069,9 +1199,10 @@ ThreadFuture>> MultiVersionTransaction::getRangeSpl } ThreadFuture>> MultiVersionTransaction::getBlobGranuleRanges( - const KeyRangeRef& keyRange) { + const KeyRangeRef& keyRange, + int rangeLimit) { auto tr = getTransaction(); - auto f = tr.transaction ? tr.transaction->getBlobGranuleRanges(keyRange) + auto f = tr.transaction ? tr.transaction->getBlobGranuleRanges(keyRange, rangeLimit) : makeTimeout>>(); return abortableFuture(f, tr.onChange); } @@ -1080,14 +1211,45 @@ ThreadResult MultiVersionTransaction::readBlobGranules(const KeyRan Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) { + // FIXME: prevent from calling this from another main thread? 
auto tr = getTransaction(); if (tr.transaction) { - return tr.transaction->readBlobGranules(keyRange, beginVersion, readVersion, granuleContext); + Version readVersionOut; + auto f = tr.transaction->readBlobGranulesStart(keyRange, beginVersion, readVersion, &readVersionOut); + auto abortableF = abortableFuture(f, tr.onChange); + abortableF.blockUntilReadyCheckOnMainThread(); + if (abortableF.isError()) { + return ThreadResult(abortableF.getError()); + } + if (granuleContext.debugNoMaterialize) { + return ThreadResult(blob_granule_not_materialized()); + } + return tr.transaction->readBlobGranulesFinish( + abortableF, keyRange, beginVersion, readVersionOut, granuleContext); } else { return abortableTimeoutResult(tr.onChange); } } +ThreadFuture>> MultiVersionTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { + // can't call this directly + return ThreadFuture>>(unsupported_operation()); +} + +ThreadResult MultiVersionTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { + // can't call this directly + return ThreadResult(unsupported_operation()); +} + void MultiVersionTransaction::atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) { auto tr = getTransaction(); if (tr.transaction) { @@ -1579,6 +1741,32 @@ ThreadFuture MultiVersionDatabase::waitPurgeGranulesComplete(const KeyRef& return abortableFuture(f, dbState->dbVar->get().onChange); } +ThreadFuture MultiVersionDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? 
dbVar.value->blobbifyRange(keyRange) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture MultiVersionDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->unblobbifyRange(keyRange) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture>> MultiVersionDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->listBlobbifiedRanges(keyRange, rangeLimit) + : ThreadFuture>>(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture MultiVersionDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->verifyBlobRange(keyRange, version) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + // Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected // Note: this will never return if the server is running a protocol from FDB 5.0 or older @@ -1644,7 +1832,7 @@ ThreadFuture MultiVersionDatabase::DatabaseState::monitorProtocolVersion() } ProtocolVersion clusterVersion = - !cv.isError() ? cv.get() : self->dbProtocolVersion.orDefault(currentProtocolVersion); + !cv.isError() ? 
cv.get() : self->dbProtocolVersion.orDefault(currentProtocolVersion()); onMainThreadVoid([self, clusterVersion]() { self->protocolVersionChanged(clusterVersion); }); return ErrorOr(Void()); }); @@ -1974,6 +2162,10 @@ const char* MultiVersionApi::getClientVersion() { return localClient->api->getClientVersion(); } +void MultiVersionApi::useFutureProtocolVersion() { + localClient->api->useFutureProtocolVersion(); +} + namespace { void validateOption(Optional value, bool canBePresent, bool canBeAbsent, bool canBeEmpty = true) { @@ -2006,7 +2198,7 @@ void MultiVersionApi::setCallbacksOnExternalThreads() { callbackOnMainThread = false; } -void MultiVersionApi::addExternalLibrary(std::string path) { +void MultiVersionApi::addExternalLibrary(std::string path, bool useFutureVersion) { std::string filename = basename(path); if (filename.empty() || !fileExists(path)) { @@ -2023,8 +2215,8 @@ void MultiVersionApi::addExternalLibrary(std::string path) { threadCount = std::max(threadCount, 1); if (externalClientDescriptions.count(filename) == 0) { - TraceEvent("AddingExternalClient").detail("LibraryPath", filename); - externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(path, true))); + TraceEvent("AddingExternalClient").detail("LibraryPath", filename).detail("UseFutureVersion", useFutureVersion); + externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(path, true, useFutureVersion))); } } @@ -2044,7 +2236,7 @@ void MultiVersionApi::addExternalLibraryDirectory(std::string path) { std::string lib = abspath(joinPath(path, filename)); if (externalClientDescriptions.count(filename) == 0) { TraceEvent("AddingExternalClient").detail("LibraryPath", filename); - externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(lib, true))); + externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(lib, true, false))); } } } @@ -2182,7 +2374,7 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, 
setCallbacksOnExternalThreads(); } else if (option == FDBNetworkOptions::EXTERNAL_CLIENT_LIBRARY) { validateOption(value, true, false, false); - addExternalLibrary(abspath(value.get().toString())); + addExternalLibrary(abspath(value.get().toString()), false); } else if (option == FDBNetworkOptions::EXTERNAL_CLIENT_DIRECTORY) { validateOption(value, true, false, false); addExternalLibraryDirectory(value.get().toString()); @@ -2213,6 +2405,9 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, } else if (option == FDBNetworkOptions::CLIENT_TMP_DIR) { validateOption(value, true, false, false); tmpDir = abspath(value.get().toString()); + } else if (option == FDBNetworkOptions::FUTURE_VERSION_CLIENT_LIBRARY) { + validateOption(value, true, false, false); + addExternalLibrary(abspath(value.get().toString()), true); } else { forwardOption = true; } @@ -2251,13 +2446,14 @@ void MultiVersionApi::setupNetwork() { for (auto i : externalClientDescriptions) { std::string path = i.second.libPath; std::string filename = basename(path); + bool useFutureVersion = i.second.useFutureVersion; // Copy external lib for each thread if (externalClients.count(filename) == 0) { externalClients[filename] = {}; for (const auto& tmp : copyExternalLibraryPerThread(path)) { externalClients[filename].push_back(Reference( - new ClientInfo(new DLApi(tmp.first, tmp.second /*unlink on load*/), path))); + new ClientInfo(new DLApi(tmp.first, tmp.second /*unlink on load*/), path, useFutureVersion))); } } } @@ -2297,6 +2493,9 @@ void MultiVersionApi::setupNetwork() { runOnExternalClientsAllThreads([this](Reference client) { TraceEvent("InitializingExternalClient").detail("LibraryPath", client->libPath); client->api->selectApiVersion(apiVersion); + if (client->useFutureVersion) { + client->api->useFutureProtocolVersion(); + } client->loadVersion(); }); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 7c69c628c2..d41e5a0260 100644 --- 
a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,8 @@ #endif #include "flow/actorcompiler.h" // This must be the last #include. +FDB_DEFINE_BOOLEAN_PARAM(CacheResult); + extern const char* getSourceVersion(); namespace { @@ -230,8 +233,9 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc VersionVector& latestCommitVersions) { latestCommitVersions.clear(); - if (info->debugID.present()) { - g_traceBatch.addEvent("TransactionDebug", info->debugID.get().first(), "NativeAPI.getLatestCommitVersions"); + if (info->readOptions.present() && info->readOptions.get().debugID.present()) { + g_traceBatch.addEvent( + "TransactionDebug", info->readOptions.get().debugID.get().first(), "NativeAPI.getLatestCommitVersions"); } if (!info->readVersionObtainedFromGrvProxy) { @@ -269,8 +273,8 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc } } } - // commitVersion == readVersion is common, do not log. - if (!updatedVersionMap && commitVersion != readVersion) { + // Do not log if commitVersion >= readVersion. + if (!updatedVersionMap && commitVersion == invalidVersion) { TraceEvent(SevDebug, "CommitVersionNotFoundForSS") .detail("InSSIDMap", iter != ssidTagMapping.end() ? 
1 : 0) .detail("Tag", tag) @@ -1278,32 +1282,6 @@ void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module, ACTOR Future getWorkerInterfaces(Reference clusterRecord); ACTOR Future> getJSON(Database db); -struct WorkerInterfacesSpecialKeyImpl : SpecialKeyRangeReadImpl { - Future getRange(ReadYourWritesTransaction* ryw, - KeyRangeRef kr, - GetRangeLimits limitsHint) const override { - if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { - Key prefix = Key(getKeyRange().begin); - return map(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord()), - [prefix = prefix, kr = KeyRange(kr)](const RangeResult& in) { - RangeResult result; - for (const auto& [k_, v] : in) { - auto k = k_.withPrefix(prefix); - if (kr.contains(k)) - result.push_back_deep(result.arena(), KeyValueRef(k, v)); - } - - std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{}); - return result; - }); - } else { - return RangeResult(); - } - } - - explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} -}; - struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl { Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, @@ -1826,6 +1804,12 @@ DatabaseContext::~DatabaseContext() { it->second->notifyContextDestroyed(); ASSERT_ABORT(server_interf.empty()); locationCache.insert(allKeys, Reference()); + for (auto& it : notAtLatestChangeFeeds) { + it.second->context = nullptr; + } + for (auto& it : changeFeedUpdaters) { + it.second->context = nullptr; + } TraceEvent("DatabaseContextDestructed", dbId).backtrace(); } @@ -2987,16 +2971,14 @@ Future getKeyLocation(Reference trState, key, member, trState->spanContext, - trState->debugID, + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, isBackward, version); - if (trState->tenant().present() && useTenant && trState->tenantId == TenantInfo::INVALID_TENANT) { + if (trState->tenant().present() && useTenant && trState->tenantId() == TenantInfo::INVALID_TENANT) { return map(f, [trState](const KeyRangeLocationInfo& locationInfo) { - if (trState->tenantId == TenantInfo::INVALID_TENANT) { - trState->tenantId = locationInfo.tenantEntry.id; - } + trState->trySetTenantId(locationInfo.tenantEntry.id); return locationInfo; }); } else { @@ -3130,16 +3112,14 @@ Future> getKeyRangeLocations(ReferencespanContext, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, version); - if (trState->tenant().present() && useTenant && trState->tenantId == TenantInfo::INVALID_TENANT) { + if (trState->tenant().present() && useTenant && trState->tenantId() == TenantInfo::INVALID_TENANT) { return map(f, [trState](const std::vector& locationInfo) { ASSERT(!locationInfo.empty()); - if (trState->tenantId == TenantInfo::INVALID_TENANT) { - trState->tenantId = locationInfo[0].tenantEntry.id; - } + trState->trySetTenantId(locationInfo[0].tenantEntry.id); return locationInfo; }); } else { @@ -3154,16 +3134,16 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange state Version version = wait(fVersion); loop { - std::vector locations = - wait(getKeyRangeLocations_internal(trState->cx, - trState->getTenantInfo(), - keys, - CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, - Reverse::False, - trState->spanContext, - trState->debugID, - trState->useProvisionalProxies, - version)); + std::vector locations = wait(getKeyRangeLocations_internal( + trState->cx, + trState->getTenantInfo(), + keys, + CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, + Reverse::False, + trState->spanContext, + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), + trState->useProvisionalProxies, + version)); totalRanges += CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT; totalRequests++; if (locations.size() == 0 || totalRanges >= trState->cx->locationCacheSize || @@ -3242,6 +3222,8 @@ TenantInfo TransactionState::getTenantInfo(AllowInvalidTenantID allowInvalidId / if (options.rawAccess) { return TenantInfo(); + } else if (!cx->internal && cx->clientInfo->get().clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + throw management_cluster_invalid_access(); } else if (!cx->internal && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !t.present()) { throw tenant_name_required(); } else if (!t.present()) { @@ -3257,8 +3239,8 @@ TenantInfo TransactionState::getTenantInfo(AllowInvalidTenantID allowInvalidId / } } - ASSERT(allowInvalidId || tenantId != TenantInfo::INVALID_TENANT); - return TenantInfo(t, authToken, tenantId); + ASSERT(allowInvalidId || tenantId_ != TenantInfo::INVALID_TENANT); + return TenantInfo(t, authToken, tenantId_); } // Returns the tenant used in this transaction. 
If the tenant is unset and raw access isn't specified, then the default @@ -3286,6 +3268,13 @@ bool TransactionState::hasTenant() const { return tenantSet && tenant_.present(); } +Future TransactionState::handleUnknownTenant() { + tenantId_ = TenantInfo::INVALID_TENANT; + ASSERT(tenant().present()); + cx->invalidateCachedTenant(tenant().get()); + return delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, taskID); +} + Future Transaction::warmRange(KeyRange keys) { return warmRange_impl(trState, keys, getReadVersion()); } @@ -3312,12 +3301,16 @@ ACTOR Future> getValue(Reference trState, state uint64_t startTime; state double startTimeD; state VersionVector ssLatestCommitVersions; + state Optional readOptions = trState->readOptions; + trState->cx->getLatestCommitVersions(locationInfo.locations, ver, trState, ssLatestCommitVersions); try { - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { getValueID = nondeterministicRandom()->randomUniqueID(); + readOptions.get().debugID = getValueID; - g_traceBatch.addAttach("GetValueAttachID", trState->debugID.get().first(), getValueID.get().first()); + g_traceBatch.addAttach( + "GetValueAttachID", trState->readOptions.get().debugID.get().first(), getValueID.get().first()); g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); @@ -3345,13 +3338,12 @@ ACTOR Future> getValue(Reference trState, locationInfo.locations, &StorageServerInterface::getValue, GetValueRequest(span.context, - useTenant ? trState->getTenantInfo() : TenantInfo(), + useTenant ? trState->getTenantInfo() : TenantInfo(), key, ver, - trState->readType, trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), - getValueID, + readOptions, ssLatestCommitVersions), TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, @@ -3405,9 +3397,8 @@ ACTOR Future> getValue(Reference trState, trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { if (trState->trLogInfo && recordLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, @@ -3427,12 +3418,16 @@ ACTOR Future getKey(Reference trState, UseTenant useTenant = UseTenant::True) { wait(success(version)); - state Optional getKeyID = Optional(); - state Span span("NAPI:getKey"_loc, trState->spanContext); - if (trState->debugID.present()) { - getKeyID = nondeterministicRandom()->randomUniqueID(); + state Optional getKeyID; + state Optional readOptions = trState->readOptions; - g_traceBatch.addAttach("GetKeyAttachID", trState->debugID.get().first(), getKeyID.get().first()); + state Span span("NAPI:getKey"_loc, trState->spanContext); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + getKeyID = nondeterministicRandom()->randomUniqueID(); + readOptions.get().debugID = getKeyID; + + g_traceBatch.addAttach( + "GetKeyAttachID", trState->readOptions.get().debugID.get().first(), getKeyID.get().first()); g_traceBatch.addEvent( "GetKeyDebug", getKeyID.get().first(), @@ -3474,9 +3469,8 @@ ACTOR Future getKey(Reference trState, useTenant ? trState->getTenantInfo() : TenantInfo(), k, version.get(), - trState->readType, trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), - getKeyID, + readOptions, ssLatestCommitVersions); req.arena.dependsOn(k.arena()); @@ -3517,9 +3511,8 @@ ACTOR Future getKey(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { TraceEvent(SevInfo, "GetKeyError").error(e).detail("AtKey", k.getKey()).detail("Offset", k.offset); throw e; @@ -3530,8 +3523,8 @@ ACTOR Future getKey(Reference trState, ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanContext spanContext) { state Span span("NAPI:waitForCommittedVersion"_loc, spanContext); - try { - loop { + loop { + try { choose { when(wait(cx->onProxiesChanged())) {} when(GetReadVersionReply v = wait(basicLoadBalance( @@ -3557,10 +3550,16 @@ ACTOR Future waitForCommittedVersion(Database cx, Version version, Span wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, cx->taskID)); } } + } catch (Error& e) { + if (e.code() == error_code_batch_transaction_throttled || + e.code() == error_code_grv_proxy_memory_limit_exceeded) { + // GRV Proxy returns an error + wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY)); + } else { + TraceEvent(SevError, "WaitForCommittedVersionError").error(e); + throw; + } } - } catch (Error& e) { - TraceEvent(SevError, "WaitForCommittedVersionError").error(e); - throw; } } @@ -3753,7 +3752,7 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference } // val_3 == val_2 (storage server value matches value passed into the function -> new watch) - if (valSS == parameters->value && tr.getTransactionState()->tenantId == parameters->tenant.tenantId) { + if (valSS == parameters->value && tr.getTransactionState()->tenantId() == 
parameters->tenant.tenantId) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); @@ -3923,7 +3922,7 @@ Future getExactRange(Reference trState, req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); - req.readType = trState->readType; + setMatchIndex(req, matchIndex); req.spanContext = span.context; trState->cx->getLatestCommitVersions( @@ -3937,13 +3936,15 @@ Future getExactRange(Reference trState, // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; + + req.options = trState->readOptions; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.Before"); - /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->debugID.get()) + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.Before"); + /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("ReqLimit", req.limit) @@ -3973,9 +3974,10 @@ Future getExactRange(Reference trState, ++trState->cx->transactionPhysicalReadsCompleted; throw; } - if (trState->debugID.present()) - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.After"); output.arena().dependsOn(rep.arena); output.append(output.arena(), rep.data.begin(), rep.data.size()); @@ -4062,9 +4064,8 @@ Future 
getExactRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); break; } else { TraceEvent(SevInfo, "GetExactRangeError") @@ -4304,7 +4305,7 @@ Future getRange(Reference trState, req.arena.dependsOn(mapper.arena()); setMatchIndex(req, matchIndex); req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); - req.readType = trState->readType; + req.options = trState->readOptions; req.version = readVersion; trState->cx->getLatestCommitVersions( @@ -4342,13 +4343,13 @@ Future getRange(Reference trState, ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; req.spanContext = span.context; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Before"); - /*TraceEvent("TransactionDebugGetRangeInfo", trState->debugID.get()) + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getRange.Before"); + /*TraceEvent("TransactionDebugGetRangeInfo", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("OriginalBegin", originalBegin.toString()) @@ -4387,11 +4388,11 @@ Future getRange(Reference trState, throw; } - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { g_traceBatch.addEvent("TransactionDebug", - 
trState->debugID.get().first(), + trState->readOptions.get().debugID.get().first(), "NativeAPI.getRange.After"); //.detail("SizeOf", rep.data.size()); - /*TraceEvent("TransactionDebugGetRangeDone", trState->debugID.get()) + /*TraceEvent("TransactionDebugGetRangeDone", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("RepIsMore", rep.more) @@ -4503,10 +4504,11 @@ Future getRange(Reference trState, } } catch (Error& e) { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Error"); - TraceEvent("TransactionDebugError", trState->debugID.get()).error(e); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getRange.Error"); + TraceEvent("TransactionDebugError", trState->readOptions.get().debugID.get()).error(e); } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && readVersion == latestVersion)) { @@ -4533,9 +4535,8 @@ Future getRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { if (trState->trLogInfo) trState->trLogInfo->addLog( @@ -4759,9 +4760,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, req.spanContext = spanContext; req.limit = reverse ? 
-CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limitBytes = std::numeric_limits::max(); - // it is used to inform the storage that the rangeRead is for Fetch - // req.isFetchKeys = (trState->readType == ReadType::FETCH); - req.readType = trState->readType; + req.options = trState->readOptions; + trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); @@ -4772,12 +4772,12 @@ ACTOR Future getRangeStreamFragment(Reference trState, // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.RangeStream.Before"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.RangeStream.Before"); } ++trState->cx->transactionPhysicalReads; state GetKeyValuesStreamReply rep; @@ -4871,9 +4871,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, } rep = GetKeyValuesStreamReply(); } - if (trState->debugID.present()) - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.After"); RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { @@ -4994,9 +4995,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { - 
ASSERT(trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + wait(trState->handleUnknownTenant()); break; } else { results->sendError(e); @@ -5279,7 +5278,7 @@ ACTOR Future getTenantMetadata(Reference trState, Future populateAndGetTenant(Reference trState, Key const& key, Version version) { if (!trState->tenant().present() || key == metadataVersionKey) { return TenantInfo(); - } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { + } else if (trState->tenantId() != TenantInfo::INVALID_TENANT) { return trState->getTenantInfo(); } else { return getTenantMetadata(trState, key, version); @@ -5354,7 +5353,7 @@ Future Transaction::watch(Reference watch) { trState->options.readTags, trState->spanContext, trState->taskID, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies); } @@ -5773,7 +5772,9 @@ double Transaction::getBackoff(int errCode) { returnedBackoff *= deterministicRandom()->random01(); // Set backoff for next time - if (errCode == error_code_proxy_memory_limit_exceeded) { + if (errCode == error_code_commit_proxy_memory_limit_exceeded || + errCode == error_code_grv_proxy_memory_limit_exceeded) { + backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, CLIENT_KNOBS->RESOURCE_CONSTRAINED_MAX_BACKOFF); } else { backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, trState->options.maxBackoff); @@ -5979,7 +5980,7 @@ ACTOR static Future commitDummyTransaction(Reference trS tr.trState->options = trState->options; tr.trState->taskID = trState->taskID; tr.trState->authToken = trState->authToken; - tr.trState->tenantId = trState->tenantId; + tr.trState->trySetTenantId(trState->tenantId()); if (!trState->hasTenant()) { tr.setOption(FDBTransactionOptions::RAW_ACCESS); } else { @@ -6020,16 +6021,17 @@ void Transaction::setupWatches() { Future 
watchVersion = getCommittedVersion() > 0 ? getCommittedVersion() : getReadVersion(); for (int i = 0; i < watches.size(); ++i) - watches[i]->setWatch(watchValueMap(watchVersion, - trState->getTenantInfo(), - watches[i]->key, - watches[i]->value, - trState->cx, - trState->options.readTags, - trState->spanContext, - trState->taskID, - trState->debugID, - trState->useProvisionalProxies)); + watches[i]->setWatch( + watchValueMap(watchVersion, + trState->getTenantInfo(), + watches[i]->key, + watches[i]->value, + trState->cx, + trState->options.readTags, + trState->spanContext, + trState->taskID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), + trState->useProvisionalProxies)); watches.clear(); } catch (Error&) { @@ -6150,15 +6152,18 @@ ACTOR static Future tryCommit(Reference trState, state TraceInterval interval("TransactionCommit"); state double startTime = now(); state Span span("NAPI:tryCommit"_loc, trState->spanContext); - state Optional debugID = trState->debugID; + state Optional debugID = trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(); state TenantPrefixPrepended tenantPrefixPrepended = TenantPrefixPrepended::False; if (debugID.present()) { TraceEvent(interval.begin()).detail("Parent", debugID.get()); } try { if (CLIENT_BUGGIFY) { - throw deterministicRandom()->randomChoice(std::vector{ - not_committed(), transaction_too_old(), proxy_memory_limit_exceeded(), commit_unknown_result() }); + throw deterministicRandom()->randomChoice(std::vector{ not_committed(), + transaction_too_old(), + commit_proxy_memory_limit_exceeded(), + grv_proxy_memory_limit_exceeded(), + commit_unknown_result() }); } if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) { @@ -6317,12 +6322,15 @@ ACTOR static Future tryCommit(Reference trState, // retry it anyway (relying on transaction idempotence) but a client might do something else. 
throw commit_unknown_result(); } else if (e.code() == error_code_unknown_tenant) { + // Rather than reset the tenant and retry just the commit, we need to throw this error to the user and let + // them retry the whole transaction ASSERT(trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); throw; } else { if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && - e.code() != error_code_database_locked && e.code() != error_code_proxy_memory_limit_exceeded && + e.code() != error_code_database_locked && e.code() != error_code_commit_proxy_memory_limit_exceeded && + e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled && e.code() != error_code_process_behind && e.code() != error_code_future_version && e.code() != error_code_tenant_not_found) { @@ -6548,10 +6556,10 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional(value.get().printable(), TransactionLogInfo::DONT_LOG); trState->trLogInfo->maxFieldLength = trState->options.maxTransactionLoggingFieldLength; } - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) - .detail("ServerTraceID", trState->debugID.get()); + .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; @@ -6583,10 +6591,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, OptionalrandomUniqueID()); - if (trState->trLogInfo && !trState->trLogInfo->identifier.empty()) { + if (trState->trLogInfo && !trState->trLogInfo->identifier.empty() && trState->readOptions.present() && + trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) - 
.detail("ServerTraceID", trState->debugID.get()); + .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; @@ -6766,9 +6775,12 @@ ACTOR Future getConsistentReadVersion(SpanContext parentSpa } } } catch (Error& e) { - if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled) + if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled && + e.code() != error_code_grv_proxy_memory_limit_exceeded) TraceEvent(SevError, "GetConsistentReadVersionError").error(e); - if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) { + if ((e.code() == error_code_batch_transaction_throttled || + e.code() == error_code_grv_proxy_memory_limit_exceeded) && + !cx->apiVersionAtLeast(630)) { wait(delayJittered(5.0)); } else { throw; @@ -7054,7 +7066,9 @@ Future Transaction::getReadVersion(uint32_t flags) { Location location = "NAPI:getReadVersion"_loc; SpanContext spanContext = generateSpanID(trState->cx->transactionTracingSample, trState->spanContext); - auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, trState->debugID); + Optional versionDebugID = + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(); + auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, versionDebugID); batcher.stream.send(req); trState->startTime = now(); readVersion = extractReadVersion(trState, location, spanContext, req.reply.getFuture(), metadataVersion); @@ -7212,14 +7226,16 @@ Future Transaction::onError(Error const& e) { return client_invalid_operation(); } if (e.code() == error_code_not_committed || e.code() == error_code_commit_unknown_result || - e.code() == error_code_database_locked || e.code() == error_code_proxy_memory_limit_exceeded || - e.code() == error_code_process_behind || e.code() == error_code_batch_transaction_throttled || - e.code() == error_code_tag_throttled) { + e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded || + e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind || + e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled || + e.code() == error_code_blob_granule_request_failed) { if (e.code() == error_code_not_committed) ++trState->cx->transactionsNotCommitted; else if (e.code() == error_code_commit_unknown_result) ++trState->cx->transactionsMaybeCommitted; - else if (e.code() == error_code_proxy_memory_limit_exceeded) + else if (e.code() == error_code_commit_proxy_memory_limit_exceeded || + e.code() == error_code_grv_proxy_memory_limit_exceeded) ++trState->cx->transactionsResourceConstrained; else if (e.code() == error_code_process_behind) ++trState->cx->transactionsProcessBehind; @@ -7607,9 +7623,7 @@ ACTOR Future>> getRangeSplitPoints(Referencecx->invalidateCache(locations[0].tenantEntry.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - 
wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + wait(trState->handleUnknownTenant()); } else { TraceEvent(SevError, "GetRangeSplitPoints").error(e); throw; @@ -7623,22 +7637,19 @@ ACTOR Future blobGranuleGetTenantEntry(Transaction* self, Key ra Optional cachedLocationInfo = self->trState->cx->getCachedLocation(self->getTenant().get(), rangeStartKey, Reverse::False); if (!cachedLocationInfo.present()) { - KeyRangeLocationInfo l = wait(getKeyLocation_internal(self->trState->cx, - self->trState->getTenantInfo(AllowInvalidTenantID::True), - rangeStartKey, - self->trState->spanContext, - self->trState->debugID, - self->trState->useProvisionalProxies, - Reverse::False, - latestVersion)); - if (self->trState->tenantId == TenantInfo::INVALID_TENANT) { - self->trState->tenantId = l.tenantEntry.id; - } + KeyRangeLocationInfo l = wait(getKeyLocation_internal( + self->trState->cx, + self->trState->getTenantInfo(AllowInvalidTenantID::True), + rangeStartKey, + self->trState->spanContext, + self->trState->readOptions.present() ? 
self->trState->readOptions.get().debugID : Optional(), + self->trState->useProvisionalProxies, + Reverse::False, + latestVersion)); + self->trState->trySetTenantId(l.tenantEntry.id); return l.tenantEntry; } else { - if (self->trState->tenantId == TenantInfo::INVALID_TENANT) { - self->trState->tenantId = cachedLocationInfo.get().tenantEntry.id; - } + self->trState->trySetTenantId(cachedLocationInfo.get().tenantEntry.id); return cachedLocationInfo.get().tenantEntry; } } @@ -7652,7 +7663,9 @@ Future>> Transaction::getRangeSplitPoints(KeyRange // the blob granule requests are a bit funky because they piggyback off the existing transaction to read from the system // keyspace -ACTOR Future>> getBlobGranuleRangesActor(Transaction* self, KeyRange keyRange) { +ACTOR Future>> getBlobGranuleRangesActor(Transaction* self, + KeyRange keyRange, + int rangeLimit) { // FIXME: use streaming range read state KeyRange currentRange = keyRange; state Standalone> results; @@ -7675,7 +7688,7 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans // basically krmGetRange, but enable it to not use tenant without RAW_ACCESS by doing manual getRange with // UseTenant::False - GetRangeLimits limits(1000); + GetRangeLimits limits(2 * rangeLimit + 2); limits.minRows = 2; RangeResult rawMapping = wait(getRange(self->trState, self->getReadVersion(), @@ -7697,6 +7710,9 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans if (blobGranuleMapping[i].value.size()) { results.push_back(results.arena(), KeyRangeRef(blobGranuleMapping[i].key, blobGranuleMapping[i + 1].key)); + if (results.size() == rangeLimit) { + return results; + } } } results.arena().dependsOn(blobGranuleMapping.arena()); @@ -7708,8 +7724,8 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans } } -Future>> Transaction::getBlobGranuleRanges(const KeyRange& range) { - return ::getBlobGranuleRangesActor(this, range); +Future>> Transaction::getBlobGranuleRanges(const KeyRange& range, int rangeLimit) { + return ::getBlobGranuleRangesActor(this, range, 
rangeLimit); } // hack (for now) to get blob worker interface into load balance @@ -7723,7 +7739,11 @@ ACTOR Future>> readBlobGranulesActor( KeyRange range, Version begin, Optional read, - Version* readVersionOut) { // read not present is "use transaction version" + Version* readVersionOut, + int chunkLimit, + bool summarize) { // read not present is "use transaction version" + + ASSERT(chunkLimit > 0); state RangeResult blobGranuleMapping; state Key granuleStartKey; @@ -7764,7 +7784,7 @@ ACTOR Future>> readBlobGranulesActor( // basically krmGetRange, but enable it to not use tenant without RAW_ACCESS by doing manual getRange with // UseTenant::False - GetRangeLimits limits(1000); + GetRangeLimits limits(CLIENT_KNOBS->BG_TOO_MANY_GRANULES); limits.minRows = 2; RangeResult rawMapping = wait(getRange(self->trState, self->getReadVersion(), @@ -7779,19 +7799,24 @@ ACTOR Future>> readBlobGranulesActor( blobGranuleMapping = krmDecodeRanges(prefix, range, rawMapping); } else { self->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - wait(store( - blobGranuleMapping, - krmGetRanges(self, blobGranuleMappingKeys.begin, keyRange, 1000, GetRangeLimits::BYTE_LIMIT_UNLIMITED))); + wait(store(blobGranuleMapping, + krmGetRanges(self, + blobGranuleMappingKeys.begin, + keyRange, + CLIENT_KNOBS->BG_TOO_MANY_GRANULES, + GetRangeLimits::BYTE_LIMIT_UNLIMITED))); } if (blobGranuleMapping.more) { if (BG_REQUEST_DEBUG) { fmt::print( "BG Mapping for [{0} - %{1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable()); } - TraceEvent(SevWarn, "BGMappingTooLarge").detail("Range", range).detail("Max", 1000); + TraceEvent(SevWarn, "BGMappingTooLarge") + .detail("Range", range) + .detail("Max", CLIENT_KNOBS->BG_TOO_MANY_GRANULES); throw unsupported_operation(); } - ASSERT(!blobGranuleMapping.more && blobGranuleMapping.size() < CLIENT_KNOBS->TOO_MANY); + ASSERT(!blobGranuleMapping.more && blobGranuleMapping.size() <= CLIENT_KNOBS->BG_TOO_MANY_GRANULES); if 
(blobGranuleMapping.size() < 2) { throw blob_granule_transaction_too_old(); @@ -7810,7 +7835,6 @@ ACTOR Future>> readBlobGranulesActor( fmt::print("Key range [{0} - {1}) missing worker assignment!\n", granuleStartKey.printable(), granuleEndKey.printable()); - // TODO probably new exception type instead } throw blob_granule_transaction_too_old(); } @@ -7837,11 +7861,9 @@ ACTOR Future>> readBlobGranulesActor( getValue(self->trState, blobWorkerListKeyFor(workerId), self->getReadVersion(), UseTenant::False))); // from the time the mapping was read from the db, the associated blob worker // could have died and so its interface wouldn't be present as part of the blobWorkerList - // we persist in the db. So throw wrong_shard_server to get the new mapping + // we persist in the db. So throw blob_granule_request_failed to get the new mapping if (!workerInterface.present()) { - // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? - // throw wrong_shard_server(); - throw transaction_too_old(); + throw blob_granule_request_failed(); } // FIXME: maybe just want to insert here if there are racing queries for the same worker or something? self->trState->cx->blobWorker_interf[workerId] = decodeBlobWorkerListValue(workerInterface.get()); @@ -7874,6 +7896,7 @@ ACTOR Future>> readBlobGranulesActor( req.readVersion = rv; req.tenantInfo = self->getTenant().present() ? 
self->trState->getTenantInfo() : TenantInfo(); req.canCollapseBegin = true; // TODO make this a parameter once we support it + req.summarize = summarize; std::vector>> v; v.push_back( @@ -7944,6 +7967,12 @@ ACTOR Future>> readBlobGranulesActor( chunkEndKey = chunkEndKey.removePrefix(tenantPrefix.get()); } keyRange = KeyRangeRef(std::min(chunkEndKey, keyRange.end), keyRange.end); + if (summarize && results.size() == chunkLimit) { + break; + } + } + if (summarize && results.size() == chunkLimit) { + break; } } // if we detect that this blob worker fails, cancel the request, as otherwise load balance will @@ -7969,10 +7998,8 @@ ACTOR Future>> readBlobGranulesActor( e.name()); } // worker is up but didn't actually have granule, or connection failed - if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed || - e.code() == error_code_unknown_tenant) { - // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? - throw transaction_too_old(); + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed) { + throw blob_granule_request_failed(); } throw e; } @@ -7992,7 +8019,32 @@ Future>> Transaction::readBlobGranules Version begin, Optional readVersion, Version* readVersionOut) { - return readBlobGranulesActor(this, range, begin, readVersion, readVersionOut); + return readBlobGranulesActor( + this, range, begin, readVersion, readVersionOut, std::numeric_limits::max(), false); +} + +ACTOR Future>> summarizeBlobGranulesActor(Transaction* self, + KeyRange range, + Version summaryVersion, + int rangeLimit) { + state Version readVersionOut; + Standalone> chunks = + wait(readBlobGranulesActor(self, range, 0, summaryVersion, &readVersionOut, rangeLimit, true)); + ASSERT(chunks.size() <= rangeLimit); + ASSERT(readVersionOut == summaryVersion); + Standalone> summaries; + summaries.reserve(summaries.arena(), chunks.size()); + for (auto& it : chunks) { + 
summaries.push_back(summaries.arena(), summarizeGranuleChunk(summaries.arena(), it)); + } + + return summaries; +} + +Future>> Transaction::summarizeBlobGranules(const KeyRange& range, + Version summaryVersion, + int rangeLimit) { + return summarizeBlobGranulesActor(this, range, summaryVersion, rangeLimit); } ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) { @@ -8016,6 +8068,93 @@ ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAw return version; } +ACTOR Future checkBlobSubrange(Database db, KeyRange keyRange, Optional version) { + state Transaction tr(db); + loop { + try { + state Version summaryVersion; + if (version.present()) { + summaryVersion = version.get(); + } else { + wait(store(summaryVersion, tr.getReadVersion())); + } + // same properties as a read for validating granule is readable, just much less memory and network bandwidth + // used + wait(success(tr.summarizeBlobGranules(keyRange, summaryVersion, std::numeric_limits::max()))); + return summaryVersion; + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} + +ACTOR Future verifyBlobRangeActor(Reference cx, KeyRange range, Optional version) { + state Database db(cx); + state Transaction tr(db); + state Standalone> allRanges; + state KeyRange curRegion = KeyRangeRef(range.begin, range.begin); + state Version readVersionOut = invalidVersion; + state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2; + state int loadSize = (BUGGIFY ? 
deterministicRandom()->randomInt(1, 20) : 20) * batchSize; + loop { + if (curRegion.begin >= range.end) { + return readVersionOut; + } + loop { + try { + wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize))); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + if (allRanges.empty()) { + if (curRegion.begin < range.end) { + return invalidVersion; + } + return readVersionOut; + } + + state std::vector> checkParts; + // Chunk up to smaller ranges than this limit. Must be smaller than BG_TOO_MANY_GRANULES to not hit the limit + int batchCount = 0; + for (auto& it : allRanges) { + if (it.begin != curRegion.end) { + return invalidVersion; + } + + curRegion = KeyRangeRef(curRegion.begin, it.end); + batchCount++; + + if (batchCount == batchSize) { + checkParts.push_back(checkBlobSubrange(db, curRegion, version)); + batchCount = 0; + curRegion = KeyRangeRef(curRegion.end, curRegion.end); + } + } + if (!curRegion.empty()) { + checkParts.push_back(checkBlobSubrange(db, curRegion, version)); + } + + try { + wait(waitForAll(checkParts)); + } catch (Error& e) { + if (e.code() == error_code_blob_granule_transaction_too_old) { + return invalidVersion; + } + throw e; + } + ASSERT(!checkParts.empty()); + readVersionOut = checkParts.back().get(); + curRegion = KeyRangeRef(curRegion.end, curRegion.end); + } +} + +Future DatabaseContext::verifyBlobRange(const KeyRange& range, Optional version) { + return verifyBlobRangeActor(Reference::addRef(this), range, version); +} + ACTOR Future>> readStorageWiggleValues(Database cx, bool primary, bool use_system_priority) { @@ -8645,38 +8784,28 @@ Future DatabaseContext::initSharedState() { } void DatabaseContext::setSharedState(DatabaseSharedState* p) { - ASSERT(p->protocolVersion == currentProtocolVersion); + ASSERT(p->protocolVersion == currentProtocolVersion()); sharedStatePtr = p; sharedStatePtr->refCount++; } ACTOR Future storageFeedVersionUpdater(StorageServerInterface interf, 
ChangeFeedStorageData* self) { - state Promise destroyed = self->destroyed; loop { - if (destroyed.isSet()) { - return Void(); - } if (self->version.get() < self->desired.get()) { wait(delay(CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) || self->version.whenAtLeast(self->desired.get())); - if (destroyed.isSet()) { - return Void(); - } if (self->version.get() < self->desired.get()) { try { ChangeFeedVersionUpdateReply rep = wait(brokenPromiseToNever( interf.changeFeedVersionUpdate.getReply(ChangeFeedVersionUpdateRequest(self->desired.get())))); - if (rep.version > self->version.get()) { self->version.set(rep.version); } } catch (Error& e) { - if (e.code() == error_code_server_overloaded) { - if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { - wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); - } - } else { - throw e; + if (e.code() != error_code_server_overloaded) { + throw; + } + if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); } } } @@ -8695,10 +8824,53 @@ Reference DatabaseContext::getStorageData(StorageServerIn newStorageUpdater->id = interf.id(); newStorageUpdater->interfToken = token; newStorageUpdater->updater = storageFeedVersionUpdater(interf, newStorageUpdater.getPtr()); - changeFeedUpdaters[token] = newStorageUpdater; + newStorageUpdater->context = this; + newStorageUpdater->created = now(); + changeFeedUpdaters[token] = newStorageUpdater.getPtr(); return newStorageUpdater; } - return it->second; + return Reference::addRef(it->second); +} + +Version DatabaseContext::getMinimumChangeFeedVersion() { + Version minVersion = std::numeric_limits::max(); + for (auto& it : changeFeedUpdaters) { + if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) { + minVersion = std::min(minVersion, it.second->version.get()); + } + } + for 
(auto& it : notAtLatestChangeFeeds) { + if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) { + minVersion = std::min(minVersion, it.second->getVersion()); + } + } + return minVersion; +} + +void DatabaseContext::setDesiredChangeFeedVersion(Version v) { + for (auto& it : changeFeedUpdaters) { + if (it.second->version.get() < v && it.second->desired.get() < v) { + it.second->desired.set(v); + } + } +} + +ChangeFeedStorageData::~ChangeFeedStorageData() { + if (context) { + context->changeFeedUpdaters.erase(interfToken); + } +} + +ChangeFeedData::ChangeFeedData(DatabaseContext* context) + : dbgid(deterministicRandom()->randomUniqueID()), context(context), notAtLatest(1), created(now()) { + if (context) { + context->notAtLatestChangeFeeds[dbgid] = this; + } +} +ChangeFeedData::~ChangeFeedData() { + if (context) { + context->notAtLatestChangeFeeds.erase(dbgid); + } } Version ChangeFeedData::getVersion() { @@ -8892,6 +9064,9 @@ ACTOR Future partialChangeFeedStream(StorageServerInterface interf, if (refresh.canBeSet() && !atLatestVersion && rep.atLatestVersion) { atLatestVersion = true; feedData->notAtLatest.set(feedData->notAtLatest.get() - 1); + if (feedData->notAtLatest.get() == 0 && feedData->context) { + feedData->context->notAtLatestChangeFeeds.erase(feedData->dbgid); + } } if (refresh.canBeSet() && rep.minStreamVersion > storageData->version.get()) { storageData->version.set(rep.minStreamVersion); @@ -9082,11 +9257,6 @@ ACTOR Future mergeChangeFeedStream(Reference db, results->streams.push_back(it.first.changeFeedStream.getReplyStream(req)); } - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->maxSeenVersion = invalidVersion; results->storageData.clear(); Promise refresh = results->refresh; @@ -9095,6 +9265,10 @@ ACTOR Future mergeChangeFeedStream(Reference db, results->storageData.push_back(db->getStorageData(interfs[i].first)); } 
results->notAtLatest.set(interfs.size()); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } refresh.send(Void()); for (int i = 0; i < interfs.size(); i++) { @@ -9137,6 +9311,8 @@ ACTOR Future getChangeFeedRange(Reference db, Databas loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); Version readVer = wait(tr.getReadVersion()); if (readVer < begin) { wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); @@ -9183,10 +9359,21 @@ ACTOR Future singleChangeFeedStreamInternal(KeyRange range, // update lastReturned once the previous mutation has been consumed if (*begin - 1 > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(*begin - 1); + if (!refresh.canBeSet()) { + try { + // refresh is set if and only if this actor is cancelled + wait(Future(Void())); + // Catch any unexpected behavior if the above contract is broken + ASSERT(false); + } catch (Error& e) { + ASSERT(e.code() == error_code_actor_cancelled); + throw; + } + } } loop { - + ASSERT(refresh.canBeSet()); state ChangeFeedStreamReply feedReply = waitNext(results->streams[0].getFuture()); *begin = feedReply.mutations.back().version + 1; @@ -9236,6 +9423,9 @@ ACTOR Future singleChangeFeedStreamInternal(KeyRange range, if (!atLatest && feedReply.atLatestVersion) { atLatest = true; results->notAtLatest.set(0); + if (results->context) { + results->context->notAtLatestChangeFeeds.erase(results->dbgid); + } } if (feedReply.minStreamVersion > results->storageData[0]->version.get()) { @@ -9274,11 +9464,6 @@ ACTOR Future singleChangeFeedStream(Reference db, results->streams.clear(); - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } 
results->streams.push_back(interf.changeFeedStream.getReplyStream(req)); results->maxSeenVersion = invalidVersion; @@ -9287,6 +9472,10 @@ ACTOR Future singleChangeFeedStream(Reference db, Promise refresh = results->refresh; results->refresh = Promise(); results->notAtLatest.set(1); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } refresh.send(Void()); wait(results->streams[0].onError() || singleChangeFeedStreamInternal(range, results, rangeID, begin, end)); @@ -9395,11 +9584,6 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } } catch (Error& e) { if (e.code() == error_code_actor_cancelled || e.code() == error_code_change_feed_popped) { - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->streams.clear(); results->storageData.clear(); if (e.code() == error_code_change_feed_popped) { @@ -9413,11 +9597,15 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } if (results->notAtLatest.get() == 0) { results->notAtLatest.set(1); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed || e.code() == error_code_unknown_change_feed || - e.code() == error_code_broken_promise) { + e.code() == error_code_broken_promise || e.code() == error_code_future_version) { db->changeFeedCache.erase(rangeID); cx->invalidateCache(Key(), keys); if (begin == lastBeginVersion) { @@ -9431,11 +9619,6 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } else { results->mutations.sendError(e); results->refresh.sendError(change_feed_cancelled()); - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - 
} results->streams.clear(); results->storageData.clear(); return Void(); @@ -9544,7 +9727,8 @@ ACTOR Future getOverlappingChangeFeedsActor(Referenc } return result; } catch (Error& e) { - if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || + e.code() == error_code_future_version) { cx->invalidateCache(Key(), range); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { @@ -9563,6 +9747,8 @@ ACTOR static Future popChangeFeedBackup(Database cx, Key rangeID, Version loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix); Optional val = wait(tr.get(rangeIDKey)); if (val.present()) { @@ -9666,6 +9852,7 @@ Reference DatabaseContext::createTransaction() { return makeReference(Database(Reference::addRef(this))); } +// BlobGranule API. 
ACTOR Future purgeBlobGranulesActor(Reference db, KeyRange range, Version purgeVersion, @@ -9677,15 +9864,11 @@ ACTOR Future purgeBlobGranulesActor(Reference db, state KeyRange purgeRange = range; state bool loadedTenantPrefix = false; - // FIXME: implement force - if (force) { - throw unsupported_operation(); - } - loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); if (tenant.present() && !loadedTenantPrefix) { TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin)); @@ -9762,6 +9945,111 @@ Future DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) { return waitPurgeGranulesCompleteActor(Reference::addRef(this), purgeKey); } +ACTOR Future>> getBlobRanges(Reference tr, + KeyRange range, + int batchLimit) { + state Standalone> blobRanges; + state Key beginKey = range.begin; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state RangeResult results = wait( + krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2)); + + blobRanges.arena().dependsOn(results.arena()); + for (int i = 0; i < results.size() - 1; i++) { + if (results[i].value == blobRangeActive) { + blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key)); + } + if (blobRanges.size() == batchLimit) { + return blobRanges; + } + } + + if (!results.more) { + return blobRanges; + } + beginKey = results.back().key; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + +ACTOR Future setBlobRangeActor(Reference cx, KeyRange range, bool active) { + state Database db(cx); + state Reference tr = makeReference(db); + + state Value value = active ? 
blobRangeActive : blobRangeInactive; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + state Standalone> startBlobRanges = wait(getBlobRanges(tr, range, 10)); + state Standalone> endBlobRanges = + wait(getBlobRanges(tr, KeyRangeRef(range.end, keyAfter(range.end)), 10)); + + if (active) { + // Idempotent request. + if (!startBlobRanges.empty() && !endBlobRanges.empty()) { + return startBlobRanges.front().begin == range.begin && endBlobRanges.front().end == range.end; + } + } else { + // An unblobbify request must be aligned to boundaries. + // It is okay to unblobbify multiple regions all at once. + if (startBlobRanges.empty() && endBlobRanges.empty()) { + return true; + } + // If there is a blob at the beginning of the range and it isn't aligned, + // or there is a blob range that begins before the end of the range, then fail. + if ((!startBlobRanges.empty() && startBlobRanges.front().begin != range.begin) || + (!endBlobRanges.empty() && endBlobRanges.front().begin < range.end)) { + return false; + } + } + + tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString()); + // This is not coalescing because we want to keep each range logically separate. 
+ wait(krmSetRange(tr, blobRangeKeys.begin, range, value)); + wait(tr->commit()); + printf("Successfully updated blob range [%s - %s) to %s\n", + range.begin.printable().c_str(), + range.end.printable().c_str(), + value.printable().c_str()); + return true; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + +Future DatabaseContext::blobbifyRange(KeyRange range) { + return setBlobRangeActor(Reference::addRef(this), range, true); +} + +Future DatabaseContext::unblobbifyRange(KeyRange range) { + return setBlobRangeActor(Reference::addRef(this), range, false); +} + +ACTOR Future>> listBlobbifiedRangesActor(Reference cx, + KeyRange range, + int rangeLimit) { + state Database db(cx); + state Reference tr = makeReference(db); + + state Standalone> blobRanges = wait(getBlobRanges(tr, range, rangeLimit)); + + return blobRanges; +} + +Future>> DatabaseContext::listBlobbifiedRanges(KeyRange range, int rowLimit) { + return listBlobbifiedRangesActor(Reference::addRef(this), range, rowLimit); +} + int64_t getMaxKeySize(KeyRef const& key) { return getMaxWriteKeySize(key, true); } diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 0635358402..892c9b84a0 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -681,7 +681,8 @@ public: break; if (it.is_unknown_range()) { - if (limits.hasByteLimit() && result.size() && itemsPastEnd >= 1 - end.offset) { + if (limits.hasByteLimit() && limits.hasSatisfiedMinRows() && result.size() && + itemsPastEnd >= 1 - end.offset) { result.more = true; break; } @@ -1783,7 +1784,8 @@ Future>> ReadYourWritesTransaction::getRangeSplitPo return waitOrError(tr.getRangeSplitPoints(range, chunkSize), resetPromise.getFuture()); } -Future>> ReadYourWritesTransaction::getBlobGranuleRanges(const KeyRange& range) { +Future>> ReadYourWritesTransaction::getBlobGranuleRanges(const KeyRange& range, + int rangeLimit) { if (checkUsedDuringCommit()) { return used_during_commit(); } @@ 
-1794,7 +1796,7 @@ Future>> ReadYourWritesTransaction::getBlobGra if (range.begin > maxKey || range.end > maxKey) return key_outside_legal_range(); - return waitOrError(tr.getBlobGranuleRanges(range), resetPromise.getFuture()); + return waitOrError(tr.getBlobGranuleRanges(range, rangeLimit), resetPromise.getFuture()); } Future>> ReadYourWritesTransaction::readBlobGranules( diff --git a/fdbclient/S3BlobStore.actor.cpp b/fdbclient/S3BlobStore.actor.cpp index 8054b778c8..ce99e30ac8 100644 --- a/fdbclient/S3BlobStore.actor.cpp +++ b/fdbclient/S3BlobStore.actor.cpp @@ -735,16 +735,21 @@ ACTOR Future connect_impl(Referenceknobs.secure_connection ? "https" : "http"; } bool isTLS = b->knobs.secure_connection == 1; + state Reference conn; if (b->useProxy) { - // TODO(renxuan): Support http proxy + TLS - if (isTLS || b->service == "443") { - fprintf(stderr, "ERROR: TLS is not supported yet when using HTTP proxy.\n"); - throw connection_failed(); + if (isTLS) { + Reference _conn = + wait(HTTP::proxyConnect(host, service, b->proxyHost.get(), b->proxyPort.get())); + conn = _conn; + } else { + host = b->proxyHost.get(); + service = b->proxyPort.get(); + Reference _conn = wait(INetworkConnections::net()->connect(host, service, false)); + conn = _conn; } - host = b->proxyHost.get(); - service = b->proxyPort.get(); + } else { + wait(store(conn, INetworkConnections::net()->connect(host, service, isTLS))); } - state Reference conn = wait(INetworkConnections::net()->connect(host, service, isTLS)); wait(conn->connectHandshake()); TraceEvent("S3BlobStoreEndpointNewConnection") @@ -892,7 +897,7 @@ ACTOR Future> doRequest_impl(ReferenceuseProxy) { + if (bstore->useProxy && bstore->knobs.secure_connection == 0) { // Has to be in absolute-form. 
canonicalURI = "http://" + bstore->host + ":" + bstore->service + canonicalURI; } diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 7cc2079e27..7f3a3c658b 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -427,7 +427,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -448,7 +450,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." 
diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 948b3a1a9a..e71f1bdb5f 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -50,7 +50,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // TLogs init( TLOG_TIMEOUT, 0.4 ); //cannot buggify because of availability init( TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS, 60 ); if( randomize && BUGGIFY ) TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS = deterministicRandom()->randomInt(5,10); - init( RECOVERY_TLOG_SMART_QUORUM_DELAY, 0.25 ); if( randomize && BUGGIFY ) RECOVERY_TLOG_SMART_QUORUM_DELAY = 0.0; // smaller might be better for bug amplification init( TLOG_STORAGE_MIN_UPDATE_INTERVAL, 0.5 ); init( BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL, 30 ); init( DESIRED_TOTAL_BYTES, 150000 ); if( randomize && BUGGIFY ) DESIRED_TOTAL_BYTES = 10000; @@ -58,10 +57,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( UPDATE_DELAY, 0.001 ); init( MAXIMUM_PEEK_BYTES, 10e6 ); init( APPLY_MUTATION_BYTES, 1e6 ); - init( RECOVERY_DATA_BYTE_LIMIT, 100000 ); - init( BUGGIFY_RECOVERY_DATA_LIMIT, 1000 ); - init( LONG_TLOG_COMMIT_TIME, 0.25 ); //cannot buggify because of recovery time - init( LARGE_TLOG_COMMIT_BYTES, 4<<20 ); init( BUGGIFY_RECOVER_MEMORY_LIMIT, 1e6 ); init( BUGGIFY_WORKER_REMOVED_MAX_LAG, 30 ); init( UPDATE_STORAGE_BYTE_LIMIT, 1e6 ); @@ -94,7 +89,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_CACHE_VERSIONS, 10e6 ); init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 ); init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01(); - init( TLOG_MAX_CREATE_DURATION, 10.0 ); + // In some rare simulation tests, particularly with log_spill:=1 configured, the 10 second limit is exceeded, causing SevError trace events + // and simulation test failure. 
Increasing the knob value to 15.0 in simulation is a workaround to avoid these failures. + init( TLOG_MAX_CREATE_DURATION, 10.0 ); if (isSimulated) TLOG_MAX_CREATE_DURATION = 15.0; init( PEEK_LOGGING_AMOUNT, 5 ); init( PEEK_LOGGING_DELAY, 5.0 ); init( PEEK_RESET_INTERVAL, 300.0 ); if ( randomize && BUGGIFY ) PEEK_RESET_INTERVAL = 20.0; @@ -133,16 +130,15 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BG_REBALANCE_POLLING_INTERVAL, 10.0 ); init( BG_REBALANCE_SWITCH_CHECK_INTERVAL, 5.0 ); if (randomize && BUGGIFY) BG_REBALANCE_SWITCH_CHECK_INTERVAL = 1.0; init( DD_QUEUE_LOGGING_INTERVAL, 5.0 ); + init( DD_QUEUE_COUNTER_REFRESH_INTERVAL, 60.0 ); + // 100 / 60 < 2 trace/sec ~ 2 * 200 = 400b/sec + init( DD_QUEUE_COUNTER_MAX_LOG, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_COUNTER_MAX_LOG = 1; + init( DD_QUEUE_COUNTER_SUMMARIZE, true ); init( RELOCATION_PARALLELISM_PER_SOURCE_SERVER, 2 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_SOURCE_SERVER = 1; init( RELOCATION_PARALLELISM_PER_DEST_SERVER, 10 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_DEST_SERVER = 1; // Note: if this is smaller than FETCH_KEYS_PARALLELISM, this will artificially reduce performance. The current default of 10 is probably too high but is set conservatively for now. 
init( DD_QUEUE_MAX_KEY_SERVERS, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_MAX_KEY_SERVERS = 1; init( DD_REBALANCE_PARALLELISM, 50 ); init( DD_REBALANCE_RESET_AMOUNT, 30 ); - init( BG_DD_MAX_WAIT, 120.0 ); - init( BG_DD_MIN_WAIT, 0.1 ); - init( BG_DD_INCREASE_RATE, 1.10 ); - init( BG_DD_DECREASE_RATE, 1.02 ); - init( BG_DD_SATURATION_DELAY, 1.0 ); init( INFLIGHT_PENALTY_HEALTHY, 1.0 ); init( INFLIGHT_PENALTY_UNHEALTHY, 500.0 ); init( INFLIGHT_PENALTY_ONE_LEFT, 1000.0 ); @@ -165,9 +161,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( PRIORITY_TEAM_FAILED, 805 ); init( PRIORITY_TEAM_0_LEFT, 809 ); init( PRIORITY_SPLIT_SHARD, 950 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350; + init( PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD, 960 ); if( randomize && BUGGIFY ) PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD = 360; // Set as the lowest priority // Data distribution init( SHARD_ENCODE_LOCATION_METADATA, false ); if( randomize && BUGGIFY ) SHARD_ENCODE_LOCATION_METADATA = true; + init( ENABLE_DD_PHYSICAL_SHARD, false ); // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true; When true, optimization of data move between DCs is disabled + init( MAX_PHYSICAL_SHARD_BYTES, 500000000 ); // 500 MB; for ENABLE_DD_PHYSICAL_SHARD; smaller leads to larger number of physicalShard per storage server + init( PHYSICAL_SHARD_METRICS_DELAY, 300.0 ); // 300 seconds; for ENABLE_DD_PHYSICAL_SHARD + init( ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME, 600.0 ); if( randomize && BUGGIFY ) ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME = 0.0; // 600 seconds; for ENABLE_DD_PHYSICAL_SHARD init( READ_REBALANCE_CPU_THRESHOLD, 15.0 ); init( READ_REBALANCE_SRC_PARALLELISM, 20 ); init( READ_REBALANCE_SHARD_TOPK, READ_REBALANCE_SRC_PARALLELISM * 2 ); @@ -250,7 +251,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( SERVER_LIST_DELAY, 1.0 ); init( RECRUITMENT_IDLE_DELAY, 1.0 ); init( 
STORAGE_RECRUITMENT_DELAY, 10.0 ); - init( BLOB_WORKER_RECRUITMENT_DELAY, 10.0 ); init( TSS_HACK_IDENTITY_MAPPING, false ); // THIS SHOULD NEVER BE SET IN PROD. Only for performance testing init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; // Super low timeout should cause tss recruitments to fail init( TSS_DD_CHECK_INTERVAL, 60.0 ); if (randomize && BUGGIFY ) TSS_DD_CHECK_INTERVAL = 1.0; // May kill all TSS quickly @@ -276,7 +276,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) DD_FAILURE_TIME = 10.0; init( DD_ZERO_HEALTHY_TEAM_DELAY, 1.0 ); init( REMOTE_KV_STORE, false ); - init( REMOTE_KV_STORE_INIT_DELAY, 0.1 ); + init( REBOOT_KV_STORE_DELAY, 0.1 ); init( REMOTE_KV_STORE_MAX_INIT_DURATION, 10.0 ); init( REBALANCE_MAX_RETRIES, 100 ); init( DD_OVERLAP_PENALTY, 10000 ); @@ -292,8 +292,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 ); if( randomize && BUGGIFY ) DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY = 5; init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000; init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 ); + init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120; init( DD_TENANT_AWARENESS_ENABLED, false ); - init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2.0 ); + init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. 
disable the consistency check when it's true @@ -371,6 +373,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REPLACE_CONTENTS_BYTES, 1e5 ); // KeyValueStoreRocksDB + init( ROCKSDB_READ_RANGE_ROW_LIMIT, 65535 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ROW_LIMIT = deterministicRandom()->randomInt(2, 10); + init( ROCKSDB_BACKGROUND_PARALLELISM, 4 ); init( ROCKSDB_READ_PARALLELISM, 4 ); // Use a smaller memtable in simulation to avoid OOMs. @@ -381,9 +385,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_PREFIX_LEN, 0 ); init( ROCKSDB_BLOCK_CACHE_SIZE, 0 ); init( ROCKSDB_METRICS_DELAY, 60.0 ); - init( ROCKSDB_READ_VALUE_TIMEOUT, 5.0 ); - init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, 5.0 ); - init( ROCKSDB_READ_RANGE_TIMEOUT, 5.0 ); + init( ROCKSDB_READ_VALUE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + init( ROCKSDB_READ_RANGE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); init( ROCKSDB_READ_QUEUE_WAIT, 1.0 ); init( ROCKSDB_READ_QUEUE_HARD_MAX, 1000 ); init( ROCKSDB_READ_QUEUE_SOFT_MAX, 500 ); @@ -400,6 +404,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true; init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 ); + init( ROCKSDB_METRICS_SAMPLE_INTERVAL, 0.0); init( ROCKSDB_MAX_SUBCOMPACTIONS, 2 ); init( ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT, 64000000000 ); // 64GB, Rocksdb option, Writes will slow down. init( ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT, 100000000000 ); // 100GB, Rocksdb option, Writes will stall. @@ -412,6 +417,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_COMPACTION_READAHEAD_SIZE, 32768 ); // 32 KB, performs bigger reads when doing compaction. 
init( ROCKSDB_BLOCK_SIZE, 32768 ); // 32 KB, size of the block in rocksdb cache. init( ENABLE_SHARDED_ROCKSDB, false ); + init( ROCKSDB_WRITE_BUFFER_SIZE, 1 << 30 ); // 1G + init( ROCKSDB_CF_WRITE_BUFFER_SIZE, 64 << 20 ); // 64M, RocksDB default. + init( ROCKSDB_MAX_TOTAL_WAL_SIZE, 0 ); // RocksDB default. + init( ROCKSDB_MAX_BACKGROUND_JOBS, 2 ); // RocksDB default. + init( ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD, 21600 ); // 6h, RocksDB default. + init( ROCKSDB_PHYSICAL_SHARD_CLEAN_UP_DELAY, isSimulated ? 10.0 : 300.0 ); // Delays shard clean up, must be larger than ROCKSDB_READ_VALUE_TIMEOUT to prevent reading deleted shard. // Leader election bool longLeaderElection = randomize && BUGGIFY; @@ -475,7 +486,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REPORT_TRANSACTION_COST_ESTIMATION_DELAY, 0.1 ); init( PROXY_REJECT_BATCH_QUEUED_TOO_LONG, true ); - bool buggfyUseResolverPrivateMutations = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR_TLOG_UNICAST; + bool buggfyUseResolverPrivateMutations = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR_TLOG_UNICAST; init( PROXY_USE_RESOLVER_PRIVATE_MUTATIONS, false ); if( buggfyUseResolverPrivateMutations ) PROXY_USE_RESOLVER_PRIVATE_MUTATIONS = deterministicRandom()->coinflip(); init( RESET_MASTER_BATCHES, 200 ); @@ -610,9 +621,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( SLOW_SMOOTHING_AMOUNT, 10.0 ); if( slowRatekeeper ) SLOW_SMOOTHING_AMOUNT = 50.0; init( METRIC_UPDATE_RATE, .1 ); if( slowRatekeeper ) METRIC_UPDATE_RATE = 0.5; init( DETAILED_METRIC_UPDATE_RATE, 5.0 ); - init (RATEKEEPER_DEFAULT_LIMIT, 1e6 ); if( randomize && BUGGIFY ) RATEKEEPER_DEFAULT_LIMIT = 0; + init( RATEKEEPER_DEFAULT_LIMIT, 1e6 ); if( randomize && BUGGIFY ) RATEKEEPER_DEFAULT_LIMIT = 0; init( RATEKEEPER_LIMIT_REASON_SAMPLE_RATE, 0.1 ); init( RATEKEEPER_PRINT_LIMIT_REASON, false ); if( randomize && BUGGIFY ) RATEKEEPER_PRINT_LIMIT_REASON = true; + init( 
RATEKEEPER_MIN_RATE, 0.0 ); + init( RATEKEEPER_MAX_RATE, 1e9 ); + init( RATEKEEPER_BATCH_MIN_RATE, 0.0 ); + init( RATEKEEPER_BATCH_MAX_RATE, 1e9 ); bool smallStorageTarget = randomize && BUGGIFY; init( TARGET_BYTES_PER_STORAGE_SERVER, 1000e6 ); if( smallStorageTarget ) TARGET_BYTES_PER_STORAGE_SERVER = 3000e3; @@ -662,6 +677,19 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 ); init( DURABILITY_LAG_INCREASE_RATE, 1.001 ); init( STORAGE_SERVER_LIST_FETCH_TIMEOUT, 20.0 ); + init( BW_THROTTLING_ENABLED, true ); + + bool buggifySmallBWLag = randomize && BUGGIFY; + init( TARGET_BW_LAG, 50.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0; + init( TARGET_BW_LAG_BATCH, 20.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0; + init( TARGET_BW_LAG_UPDATE, 9.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_UPDATE = 1.0; + init( MIN_BW_HISTORY, 10 ); + init( BW_ESTIMATION_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_ESTIMATION_INTERVAL = 2.0; + init( BW_LAG_INCREASE_AMOUNT, 1.1 ); + init( BW_LAG_DECREASE_AMOUNT, 0.9 ); + init( BW_FETCH_WORKERS_INTERVAL, 5.0 ); + init( BW_RW_LOGGING_INTERVAL, 5.0 ); + init( BW_MAX_BLOCKED_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_MAX_BLOCKED_INTERVAL = 2.0; init( MAX_AUTO_THROTTLED_TRANSACTION_TAGS, 5 ); if(randomize && BUGGIFY) MAX_AUTO_THROTTLED_TRANSACTION_TAGS = 1; init( MAX_MANUAL_THROTTLED_TRANSACTION_TAGS, 40 ); if(randomize && BUGGIFY) MAX_MANUAL_THROTTLED_TRANSACTION_TAGS = 1; @@ -676,6 +704,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false; init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10); init( GLOBAL_TAG_THROTTLING, false ); + init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false ); init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 ); init( 
GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 ); @@ -698,12 +727,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_LIMIT_BYTES, 500000 ); init( BUGGIFY_LIMIT_BYTES, 1000 ); init( FETCH_USING_STREAMING, false ); if( randomize && isSimulated && BUGGIFY ) FETCH_USING_STREAMING = true; //Determines if fetch keys uses streaming reads + init( FETCH_USING_BLOB, false ); init( FETCH_BLOCK_BYTES, 2e6 ); init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6; init( FETCH_KEYS_PARALLELISM, 2 ); - init( FETCH_KEYS_PARALLELISM_FULL, 10 ); + init( FETCH_KEYS_PARALLELISM_FULL, 6 ); init( FETCH_KEYS_LOWER_PRIORITY, 0 ); - init( FETCH_CHANGEFEED_PARALLELISM, 4 ); init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 ); init( BUGGIFY_BLOCK_BYTES, 10000 ); init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS ); @@ -712,7 +741,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_DURABILITY_LAG_REJECT_THRESHOLD, 0.25 ); init( STORAGE_DURABILITY_LAG_MIN_RATE, 0.1 ); init( STORAGE_COMMIT_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_INTERVAL = 2.0; - init( UPDATE_SHARD_VERSION_INTERVAL, 0.25 ); if( randomize && BUGGIFY ) UPDATE_SHARD_VERSION_INTERVAL = 1.0; init( BYTE_SAMPLING_FACTOR, 250 ); //cannot buggify because of differences in restarting tests init( BYTE_SAMPLING_OVERHEAD, 100 ); init( MAX_STORAGE_SERVER_WATCH_BYTES, 100e6 ); if( randomize && BUGGIFY ) MAX_STORAGE_SERVER_WATCH_BYTES = 10e3; @@ -721,7 +749,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BYTE_SAMPLE_LOAD_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_PARALLELISM = 1; init( BYTE_SAMPLE_LOAD_DELAY, 0.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_DELAY = 0.1; init( BYTE_SAMPLE_START_DELAY, 1.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_START_DELAY = 0.0; - init( 
UPDATE_STORAGE_PROCESS_STATS_INTERVAL, 5.0 ); init( BEHIND_CHECK_DELAY, 2.0 ); init( BEHIND_CHECK_COUNT, 2 ); init( BEHIND_CHECK_VERSIONS, 5 * VERSIONS_PER_SECOND ); @@ -788,7 +815,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // Dynamic Knobs (implementation) init( COMPACTION_INTERVAL, isSimulated ? 5.0 : 300.0 ); - init( UPDATE_NODE_TIMEOUT, 3.0 ); init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 ); init( GET_SNAPSHOT_AND_CHANGES_TIMEOUT, 3.0 ); init( FETCH_CHANGES_TIMEOUT, 3.0 ); @@ -804,14 +830,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DISABLE_DUPLICATE_LOG_WARNING, false ); init( HISTOGRAM_REPORT_INTERVAL, 300.0 ); - // IPager - init( PAGER_RESERVED_PAGES, 1 ); - - // IndirectShadowPager - init( FREE_PAGE_VACUUM_THRESHOLD, 1 ); - init( VACUUM_QUEUE_SIZE, 100000 ); - init( VACUUM_BYTES_PER_SECOND, 1e6 ); - // Timekeeper init( TIME_KEEPER_DELAY, 10 ); init( TIME_KEEPER_MAX_ENTRIES, 3600 * 24 * 30 * 6 ); if( randomize && BUGGIFY ) { TIME_KEEPER_MAX_ENTRIES = 2; } @@ -830,11 +848,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( FASTRESTORE_ROLE_LOGGING_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; } init( FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL = deterministicRandom()->random01() * 60 + 1; } init( FASTRESTORE_ATOMICOP_WEIGHT, 1 ); if( randomize && BUGGIFY ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; } - init( FASTRESTORE_APPLYING_PARALLELISM, 10000 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLYING_PARALLELISM = deterministicRandom()->random01() * 10 + 1; } init( FASTRESTORE_MONITOR_LEADER_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_MONITOR_LEADER_DELAY = deterministicRandom()->random01() * 100; } init( FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS, 60 ); 
if( randomize && BUGGIFY ) { FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS = deterministicRandom()->random01() * 240 + 10; } init( FASTRESTORE_TRACK_REQUEST_LATENCY, false ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_REQUEST_LATENCY = false; } - init( FASTRESTORE_TRACK_LOADER_SEND_REQUESTS, false ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_LOADER_SEND_REQUESTS = true; } init( FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT, 6144 ); if( randomize && BUGGIFY ) { FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT = 1; } init( FASTRESTORE_WAIT_FOR_MEMORY_LATENCY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_WAIT_FOR_MEMORY_LATENCY = 60; } init( FASTRESTORE_HEARTBEAT_DELAY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_HEARTBEAT_DELAY = deterministicRandom()->random01() * 120 + 2; } @@ -893,27 +909,25 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init ( CLUSTER_RECOVERY_EVENT_NAME_PREFIX, "Master" ); // Encryption - init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) { ENABLE_ENCRYPTION = deterministicRandom()->coinflip(); } + init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) ENABLE_ENCRYPTION = !ENABLE_ENCRYPTION; init( ENCRYPTION_MODE, "AES-256-CTR" ); init( SIM_KMS_MAX_KEYS, 4096 ); init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000 ); - init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_TLOG_ENCRYPTION = (ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && deterministicRandom()->coinflip()); } - init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_ENCRYPTION = (ENABLE_ENCRYPTION && deterministicRandom()->coinflip()); } + init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ) ENABLE_TLOG_ENCRYPTION = true; + init( ENABLE_STORAGE_SERVER_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_STORAGE_SERVER_ENCRYPTION = 
!ENABLE_STORAGE_SERVER_ENCRYPTION; + init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_BLOB_GRANULE_ENCRYPTION = !ENABLE_BLOB_GRANULE_ENCRYPTION; // encrypt key proxy init( ENABLE_BLOB_GRANULE_COMPRESSION, false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); } init( BLOB_GRANULE_COMPRESSION_FILTER, "GZIP" ); if ( randomize && BUGGIFY ) { BLOB_GRANULE_COMPRESSION_FILTER = "NONE"; } - - // KMS connector type + // KMS connector type init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" ); // Blob granlues - init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually - // BlobGranuleVerify* simulation tests use "blobRangeKeys", BlobGranuleCorrectness* use "tenant", default in real clusters is "tenant" - init( BG_RANGE_SOURCE, "tenant" ); - // BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs" + init( BG_URL, isSimulated ? 
"file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY); + // BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs" init( BG_METADATA_SOURCE, "knobs" ); init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000; init( BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES = BG_SNAPSHOT_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 8)); @@ -933,11 +947,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BG_MERGE_CANDIDATE_DELAY_SECONDS, BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 10.0 ); init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1; + init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10); + init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100); init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0; init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0; init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 ); init( BLOB_WORKER_BATCH_GRV_INTERVAL, 0.1 ); - + init( BLOB_WORKER_DO_REJECT_WHEN_FULL, true ); if ( randomize && BUGGIFY ) BLOB_WORKER_DO_REJECT_WHEN_FULL = false; + init( BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD, 0.9 ); init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN, 0.1 ); init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX, 5.0 ); diff --git a/fdbclient/SpecialKeySpace.actor.cpp 
b/fdbclient/SpecialKeySpace.actor.cpp index 7774b99ba7..719aff9fe8 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -133,7 +133,8 @@ std::unordered_map SpecialKeySpace::actorLineageApiComman std::set SpecialKeySpace::options = { "excluded/force", "failed/force", "excluded_locality/force", - "failed_locality/force" }; + "failed_locality/force", + "worker_interfaces/verify" }; std::set SpecialKeySpace::tracingOptions = { kTracingTransactionIdKey, kTracingTokenKey }; @@ -1603,7 +1604,8 @@ Future TracingOptionsImpl::getRange(ReadYourWritesTransaction* ryw, void TracingOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) { if (ryw->getApproximateSize() > 0) { - ryw->setSpecialKeySpaceErrorMsg("tracing options must be set first"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "configure trace", "tracing options must be set first")); ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional())); return; } @@ -1616,7 +1618,8 @@ void TracingOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, } else if (value.toString() == "false") { ryw->setToken(0); } else { - ryw->setSpecialKeySpaceErrorMsg("token must be set to true/false"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "configure trace token", "token must be set to true/false")); throw special_keys_api_failure(); } } @@ -1630,12 +1633,12 @@ Future> TracingOptionsImpl::commit(ReadYourWritesTransacti } void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) { - ryw->setSpecialKeySpaceErrorMsg("clear range disabled"); + ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString(false, "clear trace", "clear range disabled")); throw special_keys_api_failure(); } void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) { - ryw->setSpecialKeySpaceErrorMsg("clear disabled"); + 
ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString(false, "clear trace", "clear disabled")); throw special_keys_api_failure(); } @@ -2180,7 +2183,8 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac state std::vector endValues = kr.end.removePrefix(prefix).splitAny("/"_sr); // Require index (either "state" or "time") and address:port. if (beginValues.size() < 2 || endValues.size() < 2) { - ryw->setSpecialKeySpaceErrorMsg("missing required parameters (index, host)"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "missing required parameters (index, host)")); throw special_keys_api_failure(); } @@ -2199,12 +2203,14 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac parse(endValues.begin() + 1, endValues.end(), endRangeHost, timeEnd, waitStateEnd, seqEnd); } } else { - ryw->setSpecialKeySpaceErrorMsg("invalid index in actor_lineage"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "invalid index in actor_lineage")); throw special_keys_api_failure(); } } catch (Error& e) { if (e.code() != special_keys_api_failure().code()) { - ryw->setSpecialKeySpaceErrorMsg("failed to parse key"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "failed to parse key")); throw special_keys_api_failure(); } else { throw e; @@ -2214,7 +2220,8 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac if (kr.begin != kr.end && host != endRangeHost) { // The client doesn't know about all the hosts, so a get range covering // multiple hosts has no way of knowing which IP:port combos to use. 
- ryw->setSpecialKeySpaceErrorMsg("the host must remain the same on both ends of the range"); + ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString( + false, "read actor_lineage", "the host must remain the same on both ends of the range")); throw special_keys_api_failure(); } @@ -2748,6 +2755,64 @@ Future> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr return excludeLocalityCommitActor(ryw, true); } +// Defined in ReadYourWrites.actor.cpp +ACTOR Future getWorkerInterfaces(Reference clusterRecord); +// Defined in NativeAPI.actor.cpp +ACTOR Future verifyInterfaceActor(Reference connectLock, ClientWorkerInterface workerInterf); + +ACTOR static Future workerInterfacesImplGetRangeActor(ReadYourWritesTransaction* ryw, + KeyRef prefix, + KeyRangeRef kr) { + if (!ryw->getDatabase().getPtr() || !ryw->getDatabase()->getConnectionRecord()) + return RangeResult(); + + state RangeResult interfs = wait(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord())); + // for options' special keys, the boolean flag indicates if it's a SET operation + auto [verify, _] = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandOptionSpecialKey( + "worker_interfaces", "verify")]; + state RangeResult result; + if (verify) { + // if verify option is set, we try to talk to every worker and only returns those we can talk to + Reference connectLock(new FlowLock(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM)); + state std::vector> verifyInterfs; + for (const auto& [k_, value] : interfs) { + auto k = k_.withPrefix(prefix); + if (kr.contains(k)) { + ClientWorkerInterface workerInterf = + BinaryReader::fromStringRef(value, IncludeVersion()); + verifyInterfs.push_back(verifyInterfaceActor(connectLock, workerInterf)); + } else { + verifyInterfs.push_back(false); + } + } + wait(waitForAll(verifyInterfs)); + // state int index; + for (int index = 0; index < interfs.size(); index++) { + if (verifyInterfs[index].get()) { + // if we can establish a connection, add the kv 
pair into the result + result.push_back_deep(result.arena(), + KeyValueRef(interfs[index].key.withPrefix(prefix), interfs[index].value)); + } + } + } else { + for (const auto& [k_, v] : interfs) { + auto k = k_.withPrefix(prefix); + if (kr.contains(k)) + result.push_back_deep(result.arena(), KeyValueRef(k, v)); + } + } + std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{}); + return result; +} + +WorkerInterfacesSpecialKeyImpl::WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} + +Future WorkerInterfacesSpecialKeyImpl::getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const { + return workerInterfacesImplGetRangeActor(ryw, getKeyRange().begin, kr); +} + ACTOR Future validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw, KeySelector begin, KeySelector end, diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 3f110d0b80..d56c117a65 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -356,7 +356,7 @@ const Key storageCacheServerKey(UID id) { } const Value storageCacheServerValue(const StorageServerInterface& ssi) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return ObjectWriter::toValue(ssi, IncludeVersion(protocolVersion)); } @@ -666,7 +666,7 @@ const KeyRangeRef tagLocalityListKeys(LiteralStringRef("\xff/tagLocalityList/"), const KeyRef tagLocalityListPrefix = tagLocalityListKeys.begin; const Key tagLocalityListKeyFor(Optional dcID) { - BinaryWriter wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(tagLocalityListKeys.begin); wr << dcID; return wr.toValue(); @@ -679,7 +679,7 @@ const Value tagLocalityListValue(int8_t const& tagLocality) { } Optional decodeTagLocalityListKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(tagLocalityListKeys.begin), 
AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(tagLocalityListKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } @@ -695,7 +695,7 @@ const KeyRangeRef datacenterReplicasKeys(LiteralStringRef("\xff\x02/datacenterRe const KeyRef datacenterReplicasPrefix = datacenterReplicasKeys.begin; const Key datacenterReplicasKeyFor(Optional dcID) { - BinaryWriter wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(datacenterReplicasKeys.begin); wr << dcID; return wr.toValue(); @@ -708,7 +708,7 @@ const Value datacenterReplicasValue(int const& replicas) { } Optional decodeDatacenterReplicasKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(datacenterReplicasKeys.begin), AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(datacenterReplicasKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } @@ -729,14 +729,14 @@ const KeyRangeRef tLogDatacentersKeys(LiteralStringRef("\xff\x02/tLogDatacenters const KeyRef tLogDatacentersPrefix = tLogDatacentersKeys.begin; const Key tLogDatacentersKeyFor(Optional dcID) { - BinaryWriter wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(tLogDatacentersKeys.begin); wr << dcID; return wr.toValue(); } Optional decodeTLogDatacentersKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(tLogDatacentersKeys.begin), AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(tLogDatacentersKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } @@ -755,7 +755,7 @@ const Key serverListKeyFor(UID serverID) { } const Value serverListValue(StorageServerInterface const& server) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return 
ObjectWriter::toValue(server, IncludeVersion(protocolVersion)); } @@ -787,7 +787,7 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) { } Value swVersionValue(SWVersion const& swversion) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return ObjectWriter::toValue(swversion, IncludeVersion(protocolVersion)); } @@ -1331,6 +1331,9 @@ int64_t decodeBlobManagerEpochValue(ValueRef const& value) { } // blob granule data +const KeyRef blobRangeActive = LiteralStringRef("1"); +const KeyRef blobRangeInactive = StringRef(); + const KeyRangeRef blobGranuleFileKeys(LiteralStringRef("\xff\x02/bgf/"), LiteralStringRef("\xff\x02/bgf0")); const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), LiteralStringRef("\xff\x02/bgm0")); const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0")); @@ -1340,7 +1343,8 @@ const KeyRangeRef blobGranuleMergeBoundaryKeys(LiteralStringRef("\xff\x02/bgmerg LiteralStringRef("\xff\x02/bgmergebounds0")); const KeyRangeRef blobGranuleHistoryKeys(LiteralStringRef("\xff\x02/bgh/"), LiteralStringRef("\xff\x02/bgh0")); const KeyRangeRef blobGranulePurgeKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0")); -const KeyRangeRef blobGranuleVersionKeys(LiteralStringRef("\xff\x02/bgv/"), LiteralStringRef("\xff\x02/bgv0")); +const KeyRangeRef blobGranuleForcePurgedKeys(LiteralStringRef("\xff\x02/bgpforce/"), + LiteralStringRef("\xff\x02/bgpforce0")); const KeyRef blobGranulePurgeChangeKey = LiteralStringRef("\xff\x02/bgpChange"); const uint8_t BG_FILE_TYPE_DELTA = 'D'; diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp index 9c8c63f18e..b1c5d7ce53 100644 --- a/fdbclient/Tenant.cpp +++ b/fdbclient/Tenant.cpp @@ -26,11 +26,11 @@ Key TenantMapEntry::idToPrefix(int64_t id) { int64_t swapped = bigEndian64(id); - return StringRef(reinterpret_cast(&swapped), 8); 
+ return StringRef(reinterpret_cast(&swapped), TENANT_PREFIX_SIZE); } int64_t TenantMapEntry::prefixToId(KeyRef prefix) { - ASSERT(prefix.size() == 8); + ASSERT(prefix.size() == TENANT_PREFIX_SIZE); int64_t id = *reinterpret_cast(prefix.begin()); id = bigEndian64(id); ASSERT(id >= 0); @@ -47,6 +47,10 @@ std::string TenantMapEntry::tenantStateToString(TenantState tenantState) { return "removing"; case TenantState::UPDATING_CONFIGURATION: return "updating configuration"; + case TenantState::RENAMING_FROM: + return "renaming from"; + case TenantState::RENAMING_TO: + return "renaming to"; case TenantState::ERROR: return "error"; default: @@ -63,6 +67,10 @@ TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { return TenantState::REMOVING; } else if (stateStr == "updating configuration") { return TenantState::UPDATING_CONFIGURATION; + } else if (stateStr == "renaming from") { + return TenantState::RENAMING_FROM; + } else if (stateStr == "renaming to") { + return TenantState::RENAMING_TO; } else if (stateStr == "error") { return TenantState::ERROR; } @@ -70,6 +78,31 @@ TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { UNREACHABLE(); } +std::string TenantMapEntry::tenantLockStateToString(TenantLockState tenantState) { + switch (tenantState) { + case TenantLockState::UNLOCKED: + return "unlocked"; + case TenantLockState::READ_ONLY: + return "read only"; + case TenantLockState::LOCKED: + return "locked"; + default: + UNREACHABLE(); + } +} + +TenantLockState TenantMapEntry::stringToTenantLockState(std::string stateStr) { + if (stateStr == "unlocked") { + return TenantLockState::UNLOCKED; + } else if (stateStr == "read only") { + return TenantLockState::READ_ONLY; + } else if (stateStr == "locked") { + return TenantLockState::LOCKED; + } + + UNREACHABLE(); +} + TenantMapEntry::TenantMapEntry() {} TenantMapEntry::TenantMapEntry(int64_t id, TenantState tenantState, bool encrypted) : tenantState(tenantState), encrypted(encrypted) { @@ 
-109,7 +142,9 @@ std::string TenantMapEntry::toJson(int apiVersion) const { } tenantEntry["tenant_state"] = TenantMapEntry::tenantStateToString(tenantState); - + if (assignedCluster.present()) { + tenantEntry["assigned_cluster"] = assignedCluster.get().toString(); + } if (tenantGroup.present()) { json_spirit::mObject tenantGroupObject; std::string encodedTenantGroup = base64::encoder::from_string(tenantGroup.get().toString()); @@ -125,7 +160,7 @@ std::string TenantMapEntry::toJson(int apiVersion) const { } bool TenantMapEntry::matchesConfiguration(TenantMapEntry const& other) const { - return tenantGroup == other.tenantGroup; + return tenantGroup == other.tenantGroup && encrypted == other.encrypted; } void TenantMapEntry::configure(Standalone parameter, Optional value) { @@ -137,6 +172,16 @@ void TenantMapEntry::configure(Standalone parameter, Optional } } +TenantMetadataSpecification& TenantMetadata::instance() { + static TenantMetadataSpecification _instance = TenantMetadataSpecification("\xff/"_sr); + return _instance; +} + +Key TenantMetadata::tenantMapPrivatePrefix() { + static Key _prefix = "\xff"_sr.withSuffix(tenantMap().subspace.begin); + return _prefix; +} + TEST_CASE("/fdbclient/TenantMapEntry/Serialization") { TenantMapEntry entry1(1, TenantState::READY, false); ASSERT(entry1.prefix == "\x00\x00\x00\x00\x00\x00\x00\x01"_sr); diff --git a/fdbclient/TenantManagement.actor.cpp b/fdbclient/TenantManagement.actor.cpp new file mode 100644 index 0000000000..608da5c690 --- /dev/null +++ b/fdbclient/TenantManagement.actor.cpp @@ -0,0 +1,40 @@ +/* + * TenantManagement.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "fdbclient/SystemData.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/Tuple.h" +#include "flow/actorcompiler.h" // has to be last include + +namespace TenantAPI { + +TenantMode tenantModeForClusterType(ClusterType clusterType, TenantMode tenantMode) { + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + return TenantMode::DISABLED; + } else if (clusterType == ClusterType::METACLUSTER_DATA) { + return TenantMode::REQUIRED; + } else { + return tenantMode; + } +} + +} // namespace TenantAPI diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index 1dc5357572..0edd398c53 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -21,6 +21,7 @@ #include "fdbclient/BlobGranuleFiles.h" #include "fdbclient/ClusterConnectionFile.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/CoordinationInterface.h" #include "fdbclient/ThreadSafeTransaction.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/versions.h" @@ -143,13 +144,47 @@ ThreadFuture ThreadSafeDatabase::waitPurgeGranulesComplete(const KeyRef& p return onMainThread([db, key]() -> Future { return db->waitPurgeGranulesComplete(key); }); } -ThreadSafeDatabase::ThreadSafeDatabase(Reference connectionRecord, int apiVersion) { +ThreadFuture ThreadSafeDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { return 
db->blobbifyRange(range); }); +} + +ThreadFuture ThreadSafeDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { return db->blobbifyRange(range); }); +} + +ThreadFuture>> ThreadSafeDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread( + [=]() -> Future>> { return db->listBlobbifiedRanges(range, rangeLimit); }); +} + +ThreadFuture ThreadSafeDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { return db->verifyBlobRange(range, version); }); +} + +ThreadSafeDatabase::ThreadSafeDatabase(ConnectionRecordType connectionRecordType, + std::string connectionRecordString, + int apiVersion) { // Allocate memory for the Database from this thread (so the pointer is known for subsequent method calls) // but run its constructor on the main thread DatabaseContext* db = this->db = DatabaseContext::allocateOnForeignThread(); - onMainThreadVoid([db, connectionRecord, apiVersion]() { + onMainThreadVoid([db, connectionRecordType, connectionRecordString, apiVersion]() { try { + Reference connectionRecord = + connectionRecordType == ConnectionRecordType::FILE + ? 
Reference(ClusterConnectionFile::openOrDefault(connectionRecordString)) + : Reference( + new ClusterConnectionMemoryRecord(ClusterConnectionString(connectionRecordString))); + Database::createDatabase(connectionRecord, apiVersion, IsInternal::False, LocalityData(), db).extractPtr(); } catch (Error& e) { new (db) DatabaseContext(e); @@ -350,13 +385,14 @@ ThreadFuture>> ThreadSafeTransaction::getAddre } ThreadFuture>> ThreadSafeTransaction::getBlobGranuleRanges( - const KeyRangeRef& keyRange) { + const KeyRangeRef& keyRange, + int rangeLimit) { ISingleThreadTransaction* tr = this->tr; KeyRange r = keyRange; - return onMainThread([tr, r]() -> Future>> { + return onMainThread([=]() -> Future>> { tr->checkDeferredError(); - return tr->getBlobGranuleRanges(r); + return tr->getBlobGranuleRanges(r, rangeLimit); }); } @@ -364,34 +400,33 @@ ThreadResult ThreadSafeTransaction::readBlobGranules(const KeyRange Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) { - // FIXME: prevent from calling this from another main thread! + // This should not be called directly, bypassMultiversionApi should not be set + return ThreadResult(unsupported_operation()); +} +ThreadFuture>> ThreadSafeTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { ISingleThreadTransaction* tr = this->tr; KeyRange r = keyRange; - int64_t readVersionOut; - ThreadFuture>> getFilesFuture = onMainThread( - [tr, r, beginVersion, readVersion, &readVersionOut]() -> Future>> { + return onMainThread( + [tr, r, beginVersion, readVersion, readVersionOut]() -> Future>> { tr->checkDeferredError(); - return tr->readBlobGranules(r, beginVersion, readVersion, &readVersionOut); + return tr->readBlobGranules(r, beginVersion, readVersion, readVersionOut); }); - - // FIXME: can this safely avoid another main thread jump? 
- getFilesFuture.blockUntilReadyCheckOnMainThread(); - - // propagate error to client - if (getFilesFuture.isError()) { - return ThreadResult(getFilesFuture.getError()); - } - - Standalone> files = getFilesFuture.get(); - +} +ThreadResult ThreadSafeTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { // do this work off of fdb network threads for performance! - if (granule_context.debugNoMaterialize) { - return ThreadResult(blob_granule_not_materialized()); - } else { - return loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersionOut, granule_context); - } + Standalone> files = startFuture.get(); + return loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersion, granuleContext); } void ThreadSafeTransaction::addReadConflictRange(const KeyRangeRef& keys) { @@ -563,19 +598,25 @@ void ThreadSafeTransaction::reset() { extern const char* getSourceVersion(); -ThreadSafeApi::ThreadSafeApi() - : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion)), - transportId(0) {} +ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), transportId(0) {} void ThreadSafeApi::selectApiVersion(int apiVersion) { this->apiVersion = apiVersion; } const char* ThreadSafeApi::getClientVersion() { - // There is only one copy of the ThreadSafeAPI, and it never gets deleted. Also, clientVersion is never modified. + // There is only one copy of the ThreadSafeAPI, and it never gets deleted. + // Also, clientVersion is initialized on demand and never modified afterwards. 
+ if (clientVersion.empty()) { + clientVersion = format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion()); + } return clientVersion.c_str(); } +void ThreadSafeApi::useFutureProtocolVersion() { + ::useFutureProtocolVersion(); +} + void ThreadSafeApi::setNetworkOption(FDBNetworkOptions::Option option, Optional value) { if (option == FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID) { if (value.present()) { @@ -632,12 +673,12 @@ void ThreadSafeApi::stopNetwork() { Reference ThreadSafeApi::createDatabase(const char* clusterFilePath) { return Reference( - new ThreadSafeDatabase(ClusterConnectionFile::openOrDefault(clusterFilePath), apiVersion)); + new ThreadSafeDatabase(ThreadSafeDatabase::ConnectionRecordType::FILE, clusterFilePath, apiVersion)); } Reference ThreadSafeApi::createDatabaseFromConnectionString(const char* connectionString) { return Reference(new ThreadSafeDatabase( - makeReference(ClusterConnectionString(connectionString)), apiVersion)); + ThreadSafeDatabase::ConnectionRecordType::CONNECTION_STRING, connectionString, apiVersion)); } void ThreadSafeApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParameter) { diff --git a/fdbclient/Tuple.cpp b/fdbclient/Tuple.cpp index d3c3416b88..1575a565ab 100644 --- a/fdbclient/Tuple.cpp +++ b/fdbclient/Tuple.cpp @@ -208,7 +208,7 @@ Tuple& Tuple::append(double value) { return *this; } -Tuple& Tuple::append(nullptr_t) { +Tuple& Tuple::append(std::nullptr_t) { offsets.push_back(data.size()); data.push_back(data.arena(), (uint8_t)'\x00'); return *this; diff --git a/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp b/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp index 59d31fc8f3..8bd8f94872 100644 --- a/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp +++ b/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp @@ -29,7 +29,7 @@ namespace { std::string const notFoundErrorCode = "404"; void printAzureError(std::string const& 
operationName, azure::storage_lite::storage_error const& err) { - printf("(%s) : Error from Azure SDK : %s (%s) : %s", + printf("(%s) : Error from Azure SDK : %s (%s) : %s\n", operationName.c_str(), err.code_name.c_str(), err.code.c_str(), @@ -109,9 +109,9 @@ public: class WriteFile final : public IAsyncFile, ReferenceCounted { AsyncTaskThread* asyncTaskThread; - std::shared_ptr client; std::string containerName; std::string blobName; + std::shared_ptr client; int64_t m_cursor{ 0 }; // Ideally this buffer should not be a string, but // the Azure SDK only supports/tests uploading to append @@ -318,7 +318,7 @@ BackupContainerAzureBlobStore::BackupContainerAzureBlobStore(const std::string& std::string accountKey = _accountKey; auto credential = std::make_shared(accountName, accountKey); auto storageAccount = std::make_shared( - accountName, credential, true, format("https://%s", endpoint.c_str())); + accountName, credential, true, fmt::format("https://{}", endpoint)); client = std::make_unique(storageAccount, 1); } @@ -342,6 +342,7 @@ Future BackupContainerAzureBlobStore::create() { Future encryptionSetupFuture = usesEncryption() ? encryptionSetupComplete() : Void(); return createContainerFuture && encryptionSetupFuture; } + Future BackupContainerAzureBlobStore::exists() { TraceEvent(SevDebug, "BCAzureBlobStoreCheckContainerExists").detail("ContainerName", containerName); return asyncTaskThread.execAsync([containerName = this->containerName, client = this->client] { diff --git a/fdbclient/azure_backup/README.md b/fdbclient/azure_backup/README.md new file mode 100644 index 0000000000..4a34683674 --- /dev/null +++ b/fdbclient/azure_backup/README.md @@ -0,0 +1,33 @@ +# Set up the Azure Backup Testing Environment + +Make sure we built FDB with `-DBUILD_AZURE_BACKUP=ON` + +# Test + +If you run _BackupToBlob_ and _RestoreFromBlob_ workloads with the parameter _backupURL_ starting with `azure://`, +the workload will backup to and restore from the azure blob storage. 
+For example, _BackupAzureBlobCorrectness.toml_ + +## Url format + +The code now supports the following style urls: + +- `azure://.blob.core.windows.net/` (The formal url format for the blob service provided by the azure storage account) +- `azure://://` (Directly providing the endpoint address for the blob service, usually for local testing) + +## Local test environment + +We need to use the _Azurite_ to simulate an Azure blob service locally. +Please follow the [tutorial](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=docker-hub) to start your service locally. + +For example, +``` +docker run -p 10000:10000 -v `pwd`: -w mcr.microsoft.com/azure-storage/azurite azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --oauth basic --cert ./<...>.pem --key ./<...>.key.pem --debug ./ +``` + +### Notice + +- To use _https_, we need to provide the certificates via `--cert` and `--key` + The detailed [tutorial](https://github.com/Azure/Azurite/blob/main/README.md#https-setup) to set up HTTPS. 
(We tested with the `mkcert` method) +- To use Azure SDKs, we need to pass `--oauth basic` option +- Please take a look at the [difference](https://github.com/Azure/Azurite/blob/main/README.md#differences-between-azurite-and-azure-storage) between Azurite and Azure Storage diff --git a/fdbclient/azurestorage.cmake b/fdbclient/azurestorage.cmake index 36f8e24f6e..b967824948 100644 --- a/fdbclient/azurestorage.cmake +++ b/fdbclient/azurestorage.cmake @@ -1,3 +1,5 @@ +cmake_minimum_required(VERSION 3.13) + project(azurestorage-download) include(ExternalProject) diff --git a/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h b/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h index 77285ced16..ed79a56078 100644 --- a/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h +++ b/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h @@ -25,8 +25,6 @@ #include "fdbclient/AsyncTaskThread.h" #include "fdbclient/BackupContainerFileSystem.h" -#include "storage_credential.h" -#include "storage_account.h" #include "blob/blob_client.h" class BackupContainerAzureBlobStore final : public BackupContainerFileSystem, diff --git a/fdbclient/include/fdbclient/BlobGranuleCommon.h b/fdbclient/include/fdbclient/BlobGranuleCommon.h index 5120a7d021..e0589877c1 100644 --- a/fdbclient/include/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/include/fdbclient/BlobGranuleCommon.h @@ -35,7 +35,6 @@ #define BG_ENCRYPT_COMPRESS_DEBUG false // file format of actual blob files -// FIXME: use VecSerStrategy::String serialization for this struct GranuleSnapshot : VectorRef { constexpr static FileIdentifier file_identifier = 1300395; @@ -234,6 +233,22 @@ struct BlobGranuleChunkRef { } }; +struct BlobGranuleSummaryRef { + constexpr static FileIdentifier file_identifier = 9774587; + KeyRangeRef keyRange; + Version snapshotVersion; + int64_t snapshotSize; + Version deltaVersion; + int64_t deltaSize; + + template + void serialize(Ar& ar) { + serializer(ar, keyRange, 
snapshotVersion, snapshotSize, deltaVersion, deltaSize); + } +}; + +BlobGranuleSummaryRef summarizeGranuleChunk(Arena& ar, const BlobGranuleChunkRef& chunk); + enum BlobGranuleSplitState { Unknown = 0, Initialized = 1, Assigned = 2, Done = 3 }; // Boundary metadata for each range indexed by the beginning of the range. @@ -252,7 +267,6 @@ struct BlobGranuleMergeBoundary { struct BlobGranuleHistoryValue { constexpr static FileIdentifier file_identifier = 991434; UID granuleID; - // VectorRef> parentGranules; VectorRef parentBoundaries; VectorRef parentVersions; diff --git a/fdbclient/include/fdbclient/BlobGranuleReader.actor.h b/fdbclient/include/fdbclient/BlobGranuleReader.actor.h index a9008b03eb..395b76b26c 100644 --- a/fdbclient/include/fdbclient/BlobGranuleReader.actor.h +++ b/fdbclient/include/fdbclient/BlobGranuleReader.actor.h @@ -51,5 +51,7 @@ ACTOR Future readBlobGranules(BlobGranuleFileRequest request, Reference bstore, PromiseStream results); +bool isRangeFullyCovered(KeyRange range, Standalone> blobChunks); + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/BlobWorkerCommon.h b/fdbclient/include/fdbclient/BlobWorkerCommon.h index 0535301427..45ee961320 100644 --- a/fdbclient/include/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h @@ -30,7 +30,7 @@ struct BlobWorkerStats { Counter deltaBytesWritten, snapshotBytesWritten; Counter bytesReadFromFDBForInitialSnapshot; Counter bytesReadFromS3ForCompaction; - Counter rangeAssignmentRequests, readRequests; + Counter rangeAssignmentRequests, readRequests, summaryReads; Counter wrongShardServer; Counter changeFeedInputBytes; Counter readReqTotalFilesReturned; @@ -41,16 +41,32 @@ struct BlobWorkerStats { Counter readRequestsWithBegin; Counter readRequestsCollapsed; Counter flushGranuleReqs; + Counter compressionBytesRaw; + Counter compressionBytesFinal; + Counter fullRejections; int numRangesAssigned; int mutationBytesBuffered; int 
activeReadRequests; int granulesPendingSplitCheck; + Version minimumCFVersion; + Version cfVersionLag; + int notAtLatestChangeFeeds; + int64_t lastResidentMemory; + int64_t estimatedMaxResidentMemory; + + Reference initialSnapshotLock; + Reference resnapshotLock; + Reference deltaWritesLock; Future logger; // Current stats maintained for a given blob worker process - explicit BlobWorkerStats(UID id, double interval) + explicit BlobWorkerStats(UID id, + double interval, + Reference initialSnapshotLock, + Reference resnapshotLock, + Reference deltaWritesLock) : cc("BlobWorkerStats", id.toString()), s3PutReqs("S3PutReqs", cc), s3GetReqs("S3GetReqs", cc), s3DeleteReqs("S3DeleteReqs", cc), @@ -59,17 +75,31 @@ struct BlobWorkerStats { bytesReadFromFDBForInitialSnapshot("BytesReadFromFDBForInitialSnapshot", cc), bytesReadFromS3ForCompaction("BytesReadFromS3ForCompaction", cc), rangeAssignmentRequests("RangeAssignmentRequests", cc), readRequests("ReadRequests", cc), - wrongShardServer("WrongShardServer", cc), changeFeedInputBytes("ChangeFeedInputBytes", cc), - readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), + summaryReads("SummaryReads", cc), wrongShardServer("WrongShardServer", cc), + changeFeedInputBytes("ChangeFeedInputBytes", cc), readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), readReqDeltaBytesReturned("ReadReqDeltaBytesReturned", cc), commitVersionChecks("CommitVersionChecks", cc), granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc), readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc), - flushGranuleReqs("FlushGranuleReqs", cc), numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), - granulesPendingSplitCheck(0) { + flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc), + compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc), numRangesAssigned(0), + 
mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), + cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0), + initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) { specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; }); specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; }); specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; }); specialCounter(cc, "GranulesPendingSplitCheck", [this]() { return this->granulesPendingSplitCheck; }); + specialCounter(cc, "MinimumChangeFeedVersion", [this]() { return this->minimumCFVersion; }); + specialCounter(cc, "CFVersionLag", [this]() { return this->cfVersionLag; }); + specialCounter(cc, "NotAtLatestChangeFeeds", [this]() { return this->notAtLatestChangeFeeds; }); + specialCounter(cc, "LastResidentMemory", [this]() { return this->lastResidentMemory; }); + specialCounter(cc, "EstimatedMaxResidentMemory", [this]() { return this->estimatedMaxResidentMemory; }); + specialCounter(cc, "InitialSnapshotsActive", [this]() { return this->initialSnapshotLock->activePermits(); }); + specialCounter(cc, "InitialSnapshotsWaiting", [this]() { return this->initialSnapshotLock->waiters(); }); + specialCounter(cc, "ReSnapshotsActive", [this]() { return this->resnapshotLock->activePermits(); }); + specialCounter(cc, "ReSnapshotsWaiting", [this]() { return this->resnapshotLock->waiters(); }); + specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); }); + specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); }); logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics"); } diff --git a/fdbclient/include/fdbclient/BlobWorkerInterface.h b/fdbclient/include/fdbclient/BlobWorkerInterface.h index 
de370d248f..69d938300e 100644 --- a/fdbclient/include/fdbclient/BlobWorkerInterface.h +++ b/fdbclient/include/fdbclient/BlobWorkerInterface.h @@ -30,15 +30,15 @@ struct BlobWorkerInterface { constexpr static FileIdentifier file_identifier = 8358753; - // TODO: mimic what StorageServerInterface does with sequential endpoint IDs RequestStream> waitFailure; - RequestStream blobGranuleFileRequest; + PublicRequestStream blobGranuleFileRequest; RequestStream assignBlobRangeRequest; RequestStream revokeBlobRangeRequest; RequestStream granuleAssignmentsRequest; RequestStream granuleStatusStreamRequest; RequestStream haltBlobWorker; RequestStream flushGranuleRequest; + RequestStream minBlobVersionRequest; struct LocalityData locality; UID myId; @@ -57,6 +57,7 @@ struct BlobWorkerInterface { streams.push_back(granuleStatusStreamRequest.getReceiver()); streams.push_back(haltBlobWorker.getReceiver()); streams.push_back(flushGranuleRequest.getReceiver()); + streams.push_back(minBlobVersionRequest.getReceiver()); FlowTransport::transport().addEndpoints(streams); } UID id() const { return myId; } @@ -72,7 +73,7 @@ struct BlobWorkerInterface { serializer(ar, myId, locality, waitFailure); if (Archive::isDeserializing) { blobGranuleFileRequest = - RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); + PublicRequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); assignBlobRangeRequest = RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(2)); revokeBlobRangeRequest = @@ -85,6 +86,8 @@ struct BlobWorkerInterface { RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(6)); flushGranuleRequest = RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(7)); + minBlobVersionRequest = + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(8)); } } }; @@ -110,13 +113,16 @@ struct BlobGranuleFileRequest { Version readVersion; bool canCollapseBegin = true; TenantInfo tenantInfo; + bool summarize = false; ReplyPromise reply; 
BlobGranuleFileRequest() {} + bool verify() const { return tenantInfo.isAuthorized(); } + template void serialize(Ar& ar) { - serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, tenantInfo, reply, arena); + serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, tenantInfo, summarize, reply, arena); } }; @@ -137,6 +143,28 @@ struct RevokeBlobRangeRequest { } }; +struct MinBlobVersionReply { + constexpr static FileIdentifier file_identifier = 6857512; + Version version; + + template + void serialize(Ar& ar) { + serializer(ar, version); + } +}; + +struct MinBlobVersionRequest { + constexpr static FileIdentifier file_identifier = 4833278; + Version grv; + ReplyPromise reply; + + MinBlobVersionRequest() {} + + template + void serialize(Ar& ar) { + serializer(ar, grv, reply); + } +}; /* * Continue: Blob worker should continue handling a granule that was evaluated for a split * Normal: Blob worker should open the granule and start processing it @@ -172,6 +200,7 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { KeyRange granuleRange; bool doSplit; bool writeHotSplit; + bool initialSplitTooBig; int64_t continueEpoch; int64_t continueSeqno; UID granuleID; @@ -180,11 +209,13 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { bool mergeCandidate; int64_t originalEpoch; int64_t originalSeqno; + Optional proposedSplitKey; GranuleStatusReply() {} explicit GranuleStatusReply(KeyRange range, bool doSplit, bool writeHotSplit, + bool initialSplitTooBig, int64_t continueEpoch, int64_t continueSeqno, UID granuleID, @@ -193,11 +224,15 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { bool mergeCandidate, int64_t originalEpoch, int64_t originalSeqno) - : granuleRange(range), doSplit(doSplit), writeHotSplit(writeHotSplit), continueEpoch(continueEpoch), - continueSeqno(continueSeqno), granuleID(granuleID), startVersion(startVersion), blockedVersion(blockedVersion), - mergeCandidate(mergeCandidate), 
originalEpoch(originalEpoch), originalSeqno(originalSeqno) {} + : granuleRange(range), doSplit(doSplit), writeHotSplit(writeHotSplit), initialSplitTooBig(initialSplitTooBig), + continueEpoch(continueEpoch), continueSeqno(continueSeqno), granuleID(granuleID), startVersion(startVersion), + blockedVersion(blockedVersion), mergeCandidate(mergeCandidate), originalEpoch(originalEpoch), + originalSeqno(originalSeqno) {} - int expectedSize() const { return sizeof(GranuleStatusReply) + granuleRange.expectedSize(); } + int expectedSize() const { + return sizeof(GranuleStatusReply) + granuleRange.expectedSize() + + (proposedSplitKey.present() ? proposedSplitKey.get().expectedSize() : 0); + } template void serialize(Ar& ar) { @@ -207,6 +242,7 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { granuleRange, doSplit, writeHotSplit, + initialSplitTooBig, continueEpoch, continueSeqno, granuleID, @@ -214,7 +250,8 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { blockedVersion, mergeCandidate, originalEpoch, - originalSeqno); + originalSeqno, + proposedSplitKey); } }; diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 4978bd84e9..4bcec9fd44 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -39,10 +39,6 @@ public: double FAILURE_MAX_DELAY; double FAILURE_MIN_DELAY; - double FAILURE_TIMEOUT_DELAY; - double CLIENT_FAILURE_TIMEOUT_DELAY; - double FAILURE_EMERGENCY_DELAY; - double FAILURE_MAX_GENERATIONS; double RECOVERY_DELAY_START_GENERATION; double RECOVERY_DELAY_SECONDS_PER_GENERATION; double MAX_GENERATIONS; @@ -61,6 +57,7 @@ public: double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is // mostly wrong (e.g. 
dumping the database after a test) double FUTURE_VERSION_RETRY_DELAY; + double GRV_ERROR_RETRY_DELAY; double UNKNOWN_TENANT_RETRY_DELAY; int REPLY_BYTE_LIMIT; double DEFAULT_BACKOFF; @@ -81,6 +78,7 @@ public: int64_t CHANGE_FEED_CACHE_SIZE; double CHANGE_FEED_POP_TIMEOUT; int64_t CHANGE_FEED_STREAM_MIN_BYTES; + double CHANGE_FEED_START_INTERVAL; int MAX_BATCH_SIZE; double GRV_BATCH_TIMEOUT; @@ -161,10 +159,8 @@ public: double BACKUP_AGGREGATE_POLL_RATE; double BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL; int BACKUP_LOG_WRITE_BATCH_MAX_SIZE; - int BACKUP_LOG_ATOMIC_OPS_SIZE; int BACKUP_MAX_LOG_RANGES; int BACKUP_SIM_COPY_LOG_RANGES; - int BACKUP_OPERATION_COST_OVERHEAD; int BACKUP_VERSION_DELAY; int BACKUP_MAP_KEY_LOWER_LIMIT; int BACKUP_MAP_KEY_UPPER_LIMIT; @@ -269,12 +265,9 @@ public: double BUSYNESS_SPIKE_START_THRESHOLD; double BUSYNESS_SPIKE_SATURATED_THRESHOLD; - // multi-version client control - int MVC_CLIENTLIB_CHUNK_SIZE; - int MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION; - // Blob Granules int BG_MAX_GRANULE_PARALLELISM; + int BG_TOO_MANY_GRANULES; // The coordinator key/value in storage server might be inconsistent to the value stored in the cluster file. // This might happen when a recovery is happening together with a cluster controller coordinator key change. 
@@ -285,6 +278,12 @@ public: // Tenants and Metacluster int MAX_TENANTS_PER_CLUSTER; + int TENANT_TOMBSTONE_CLEANUP_INTERVAL; + int MAX_DATA_CLUSTERS; + int REMOVE_CLUSTER_TENANT_BATCH_SIZE; + int METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK; + double METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY; + double METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT; ClientKnobs(Randomize randomize); void initialize(Randomize randomize); diff --git a/fdbclient/include/fdbclient/ClusterInterface.h b/fdbclient/include/fdbclient/ClusterInterface.h index 14935f1700..a4e3da44f3 100644 --- a/fdbclient/include/fdbclient/ClusterInterface.h +++ b/fdbclient/include/fdbclient/ClusterInterface.h @@ -98,32 +98,44 @@ struct ClusterControllerClientInterface { } }; -template -struct ItemWithExamples { - T item; - int count; - std::vector> examples; - - ItemWithExamples() : item{}, count(0) {} - ItemWithExamples(T const& item, int count, std::vector> const& examples) - : item(item), count(count), examples(examples) {} - - template - void serialize(Ar& ar) { - serializer(ar, item, count, examples); - } -}; - struct OpenDatabaseRequest { constexpr static FileIdentifier file_identifier = 2799502; // Sent by the native API to the cluster controller to open a database and track client // info changes. 
Returns immediately if the current client info id is different from // knownClientInfoID; otherwise returns when it next changes (or perhaps after a long interval) - int clientCount; - std::vector> issues; - std::vector>> supportedVersions; - std::vector> maxProtocolSupported; + struct Samples { + int count; + + // network address / trace log group + std::set> samples; + + Samples() : count(0), samples{} {} + + template + void serialize(Ar& ar) { + serializer(ar, count, samples); + } + + // Merges a set of Samples into *this + Samples& operator+=(const Samples& other) { + count += other.count; + samples.insert(std::begin(other.samples), std::end(other.samples)); + + return *this; + } + }; + + int clientCount = 0; + + // Maps issue to Samples + std::map issues; + + // Maps ClientVersionRef to Samples + std::map, Samples> supportedVersions; + + // Maps max protocol to Samples + std::map maxProtocolSupported; UID knownClientInfoID; ReplyPromise reply; diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index 253e0e1f36..1a6a0410ae 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -25,6 +25,7 @@ #include #include +#include "fdbclient/EncryptKeyProxyInterface.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/CommitTransaction.h" @@ -118,8 +119,11 @@ struct ClientDBInfo { std::vector history; UID clusterId; bool isEncryptionEnabled = false; + Optional encryptKeyProxy; TenantMode tenantMode; + ClusterType clusterType = ClusterType::STANDALONE; + Optional metaclusterName; ClientDBInfo() {} @@ -131,7 +135,18 @@ struct ClientDBInfo { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); } - serializer(ar, grvProxies, commitProxies, id, forward, history, tenantMode, clusterId, isEncryptionEnabled); + serializer(ar, + grvProxies, + commitProxies, + id, + forward, + 
history, + tenantMode, + isEncryptionEnabled, + encryptKeyProxy, + clusterId, + clusterType, + metaclusterName); } }; diff --git a/fdbclient/include/fdbclient/CommitTransaction.h b/fdbclient/include/fdbclient/CommitTransaction.h index f6757ac17e..dc26df4fa4 100644 --- a/fdbclient/include/fdbclient/CommitTransaction.h +++ b/fdbclient/include/fdbclient/CommitTransaction.h @@ -25,6 +25,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/Knobs.h" #include "fdbclient/Tracing.h" +#include "flow/BlobCipher.h" // The versioned message has wire format : -1, version, messages static const int32_t VERSION_HEADER = -1; @@ -79,7 +80,7 @@ struct MutationRef { CompareAndClear, Reserved_For_SpanContextMessage /* See fdbserver/SpanContextMessage.h */, Reserved_For_OTELSpanContextMessage, - Reserved_For_EncryptedMutationMessage /* See fdbserver/EncryptedMutationMessage.actor.h */, + Encrypted, /* Represents an encrypted mutation and cannot be used directly before decrypting */ MAX_ATOMIC_OP }; // This is stored this way for serialization purposes. @@ -128,6 +129,64 @@ struct MutationRef { } } + // An encrypted mutation has type Encrypted, encryption header (which contains encryption metadata) as param1, + // and the payload as param2. It can be serialize/deserialize as normal mutation, but can only be used after + // decryption via decrypt(). 
+ bool isEncrypted() const { return type == Encrypted; } + + const BlobCipherEncryptHeader* encryptionHeader() const { + ASSERT(isEncrypted()); + return reinterpret_cast(param1.begin()); + } + + MutationRef encrypt(const std::unordered_map>& cipherKeys, + const EncryptCipherDomainId& domainId, + Arena& arena) const { + ASSERT_NE(domainId, ENCRYPT_INVALID_DOMAIN_ID); + auto textCipherItr = cipherKeys.find(domainId); + auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + uint8_t iv[AES_256_IV_LENGTH] = { 0 }; + deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); + BinaryWriter bw(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); + bw << *this; + EncryptBlobCipherAes265Ctr cipher(textCipherItr->second, + headerCipherItr->second, + iv, + AES_256_IV_LENGTH, + ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + BlobCipherEncryptHeader* header = new (arena) BlobCipherEncryptHeader; + StringRef headerRef(reinterpret_cast(header), sizeof(BlobCipherEncryptHeader)); + StringRef payload = + cipher.encrypt(static_cast(bw.getData()), bw.getLength(), header, arena)->toStringRef(); + return MutationRef(Encrypted, headerRef, payload); + } + + MutationRef encryptMetadata(const std::unordered_map>& cipherKeys, + Arena& arena) const { + return encrypt(cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, arena); + } + + MutationRef decrypt(const std::unordered_map>& cipherKeys, + Arena& arena, + StringRef* buf = nullptr) const { + const BlobCipherEncryptHeader* header = encryptionHeader(); + auto textCipherItr = cipherKeys.find(header->cipherTextDetails); + auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + 
DecryptBlobCipherAes256Ctr cipher(textCipherItr->second, headerCipherItr->second, header->iv); + StringRef plaintext = cipher.decrypt(param2.begin(), param2.size(), *header, arena)->toStringRef(); + if (buf != nullptr) { + *buf = plaintext; + } + ArenaReader reader(arena, plaintext, AssumeVersion(ProtocolVersion::withEncryptionAtRest())); + MutationRef mutation; + reader >> mutation; + return mutation; + } + // These masks define which mutation types have particular properties (they are used to implement // isSingleKeyMutation() etc) enum { diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index 092a290a4c..1a4fee7126 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -25,6 +25,7 @@ #include "flow/FastRef.h" #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/StorageServerInterface.h" +#include "flow/IRandom.h" #include "flow/genericactors.actor.h" #include #include @@ -167,10 +168,11 @@ struct ChangeFeedStorageData : ReferenceCounted { Future updater; NotifiedVersion version; NotifiedVersion desired; - Promise destroyed; UID interfToken; + DatabaseContext* context; + double created; - ~ChangeFeedStorageData() { destroyed.send(Void()); } + ~ChangeFeedStorageData(); }; struct ChangeFeedData : ReferenceCounted { @@ -180,6 +182,8 @@ struct ChangeFeedData : ReferenceCounted { Version getVersion(); Future whenAtLeast(Version version); + UID dbgid; + DatabaseContext* context; NotifiedVersion lastReturnedVersion; std::vector> storageData; AsyncVar notAtLatest; @@ -188,8 +192,10 @@ struct ChangeFeedData : ReferenceCounted { Version endVersion = invalidVersion; Version popVersion = invalidVersion; // like TLog pop version, set by SS and client can check it to see if they missed data + double created = 0; - ChangeFeedData() : notAtLatest(1) {} + explicit ChangeFeedData(DatabaseContext* context = nullptr); + ~ChangeFeedData(); }; struct 
EndpointFailureInfo { @@ -374,12 +380,18 @@ public: Future getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion); Future popChangeFeedMutations(Key rangeID, Version version); + // BlobGranule API. Future purgeBlobGranules(KeyRange keyRange, Version purgeVersion, Optional tenant, bool force = false); Future waitPurgeGranulesComplete(Key purgeKey); + Future blobbifyRange(KeyRange range); + Future unblobbifyRange(KeyRange range); + Future>> listBlobbifiedRanges(KeyRange range, int rangeLimit); + Future verifyBlobRange(const KeyRange& range, Optional version); + // private: explicit DatabaseContext(Reference>> connectionRecord, Reference> clientDBInfo, @@ -467,9 +479,12 @@ public: std::unordered_map> tssMetrics; // map from changeFeedId -> changeFeedRange std::unordered_map changeFeedCache; - std::unordered_map> changeFeedUpdaters; + std::unordered_map changeFeedUpdaters; + std::map notAtLatestChangeFeeds; Reference getStorageData(StorageServerInterface interf); + Version getMinimumChangeFeedVersion(); + void setDesiredChangeFeedVersion(Version v); // map from ssid -> ss tag // @note this map allows the client to identify the latest commit versions diff --git a/fdbserver/include/fdbserver/EncryptKeyProxyInterface.h b/fdbclient/include/fdbclient/EncryptKeyProxyInterface.h similarity index 100% rename from fdbserver/include/fdbserver/EncryptKeyProxyInterface.h rename to fdbclient/include/fdbclient/EncryptKeyProxyInterface.h diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index 4f4d48cb75..f770c92592 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -331,6 +331,22 @@ struct KeyRangeRef { bool empty() const { return begin == end; } bool singleKeyRange() const { return equalsKeyAfter(begin, end); } + // Return true if it's fully covered by given range list. 
Note that ranges should be sorted + bool isCovered(std::vector& ranges) { + ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder())); + KeyRangeRef clone(begin, end); + for (auto r : ranges) { + if (begin < r.begin) + return false; // uncovered gap between clone.begin and r.begin + if (end <= r.end) + return true; // range is fully covered + if (end > r.begin) + // {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end} + clone = KeyRangeRef(r.end, clone.end); + } + return false; + } + Standalone withPrefix(const StringRef& prefix) const { return KeyRangeRef(begin.withPrefix(prefix), end.withPrefix(prefix)); } @@ -1283,8 +1299,6 @@ struct WorkerBackupStatus { enum class TransactionPriority : uint8_t { BATCH, DEFAULT, IMMEDIATE, MIN = BATCH, MAX = IMMEDIATE }; -enum class ReadType { EAGER = 0, FETCH = 1, LOW = 2, NORMAL = 3, HIGH = 4, MIN = EAGER, MAX = HIGH }; - const std::array allTransactionPriorities = { TransactionPriority::BATCH, TransactionPriority::DEFAULT, @@ -1394,6 +1408,11 @@ struct TenantMode { uint32_t mode; }; +typedef StringRef ClusterNameRef; +typedef Standalone ClusterName; + +enum class ClusterType { STANDALONE, METACLUSTER_MANAGEMENT, METACLUSTER_DATA }; + struct GRVCacheSpace { Version cachedReadVersion; double lastGrvTime; @@ -1415,7 +1434,7 @@ struct DatabaseSharedState { std::atomic refCount; DatabaseSharedState() - : protocolVersion(currentProtocolVersion), mutexLock(Mutex()), grvCacheSpace(GRVCacheSpace()), refCount(0) {} + : protocolVersion(currentProtocolVersion()), mutexLock(Mutex()), grvCacheSpace(GRVCacheSpace()), refCount(0) {} }; inline bool isValidPerpetualStorageWiggleLocality(std::string locality) { @@ -1462,7 +1481,7 @@ struct StorageMetadataType { bool wrongConfigured = false; StorageMetadataType() : createdTime(0) {} - StorageMetadataType(uint64_t t, KeyValueStoreType storeType = KeyValueStoreType::END, bool wrongConfigured = false) + StorageMetadataType(double t, 
KeyValueStoreType storeType = KeyValueStoreType::END, bool wrongConfigured = false) : createdTime(t), storeType(storeType), wrongConfigured(wrongConfigured) {} static double currentTime() { return g_network->timer(); } @@ -1512,6 +1531,44 @@ struct StorageWiggleValue { } }; +enum class ReadType { + EAGER = 0, + FETCH = 1, + LOW = 2, + NORMAL = 3, + HIGH = 4, + MIN = EAGER, + MAX = HIGH +}; + +FDB_DECLARE_BOOLEAN_PARAM(CacheResult); + +// store options for storage engine read +// ReadType describes the usage and priority of the read +// cacheResult determines whether the storage engine cache for this read +// consistencyCheckStartVersion indicates the consistency check which began at this version +// debugID helps to trace the path of the read +struct ReadOptions { + ReadType type; + // Once CacheResult is serializable, change type from bool to CacheResult + bool cacheResult; + Optional debugID; + Optional consistencyCheckStartVersion; + + ReadOptions() : type(ReadType::NORMAL), cacheResult(CacheResult::True){}; + + ReadOptions(Optional debugID, + ReadType type = ReadType::NORMAL, + CacheResult cache = CacheResult::False, + Optional version = Optional()) + : type(type), cacheResult(cache), debugID(debugID), consistencyCheckStartVersion(version){}; + + template + void serialize(Ar& ar) { + serializer(ar, type, cacheResult, debugID, consistencyCheckStartVersion); + } +}; + // Can be used to identify types (e.g. IDatabase) that can be used to create transactions with a `createTransaction` // function template diff --git a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h index 8fe0d08fd2..4c920f5da6 100644 --- a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h @@ -39,6 +39,7 @@ the contents of the system key space. 
#include "fdbclient/Status.h" #include "fdbclient/Subspace.h" #include "fdbclient/DatabaseConfiguration.h" +#include "fdbclient/Metacluster.h" #include "fdbclient/Status.h" #include "fdbclient/SystemData.h" #include "flow/actorcompiler.h" // has to be last include @@ -69,6 +70,7 @@ enum class ConfigurationResult { SUCCESS_WARN_SHARDED_ROCKSDB_EXPERIMENTAL, DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL, DATABASE_CREATED_WARN_SHARDED_ROCKSDB_EXPERIMENTAL, + DATABASE_IS_REGISTERED }; enum class CoordinatorsResult { @@ -475,6 +477,14 @@ Future changeConfig(Reference db, std::map metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + if (metaclusterRegistration.present()) { + return ConfigurationResult::DATABASE_IS_REGISTERED; + } + } } } if (creating) { diff --git a/fdbserver/GetEncryptCipherKeys.actor.cpp b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h similarity index 66% rename from fdbserver/GetEncryptCipherKeys.actor.cpp rename to fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h index 328ff21587..42537bfacb 100644 --- a/fdbserver/GetEncryptCipherKeys.actor.cpp +++ b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h @@ -1,5 +1,5 @@ /* - * GetEncryptCipherKeys.actor.cpp + * GetEncryptCipherKeys.actor.h * * This source file is part of the FoundationDB open source project * @@ -17,18 +17,29 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H) +#define FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H +#include "fdbclient/GetEncryptCipherKeys.actor.g.h" +#elif !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_H) +#define FDBCLIENT_GETCIPHERKEYS_ACTOR_H -#include "fdbserver/GetEncryptCipherKeys.h" +#include "fdbclient/EncryptKeyProxyInterface.h" +#include "flow/BlobCipher.h" +#include "flow/IRandom.h" -#include +#include +#include -namespace { +#include "flow/actorcompiler.h" // This must be the last #include. -Optional getEncryptKeyProxyId(const Reference const>& db) { - return db->get().encryptKeyProxy.map([](EncryptKeyProxyInterface proxy) { return proxy.id(); }); +template +Optional getEncryptKeyProxyId(const Reference const>& db) { + return db->get().encryptKeyProxy.template map([](EncryptKeyProxyInterface proxy) { return proxy.id(); }); } -ACTOR Future onEncryptKeyProxyChange(Reference const> db) { +ACTOR template +Future onEncryptKeyProxyChange(Reference const> db) { state Optional previousProxyId = getEncryptKeyProxyId(db); state Optional currentProxyId; loop { @@ -44,9 +55,9 @@ ACTOR Future onEncryptKeyProxyChange(Reference cons return Void(); } -ACTOR Future getUncachedLatestEncryptCipherKeys( - Reference const> db, - EKPGetLatestBaseCipherKeysRequest request) { +ACTOR template +Future getUncachedLatestEncryptCipherKeys(Reference const> db, + EKPGetLatestBaseCipherKeysRequest request) { Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. @@ -71,10 +82,12 @@ ACTOR Future getUncachedLatestEncryptCipherKeys } } -} // anonymous namespace - -ACTOR Future>> getLatestEncryptCipherKeys( - Reference const> db, +// Get latest cipher keys for given encryption domains. It tries to get the cipher keys from local cache. +// In case of cache miss, it fetches the cipher keys from EncryptKeyProxy and put the result in the local cache +// before return. 
+ACTOR template +Future>> getLatestEncryptCipherKeys( + Reference const> db, std::unordered_map domains) { state Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); state std::unordered_map> cipherKeys; @@ -105,8 +118,12 @@ ACTOR Future> for (const EKPBaseCipherDetails& details : reply.baseCipherDetails) { EncryptCipherDomainId domainId = details.encryptDomainId; if (domains.count(domainId) > 0 && cipherKeys.count(domainId) == 0) { - Reference cipherKey = cipherKeyCache->insertCipherKey( - domainId, details.baseCipherId, details.baseCipherKey.begin(), details.baseCipherKey.size()); + Reference cipherKey = cipherKeyCache->insertCipherKey(domainId, + details.baseCipherId, + details.baseCipherKey.begin(), + details.baseCipherKey.size(), + details.refreshAt, + details.expireAt); ASSERT(cipherKey.isValid()); cipherKeys[domainId] = cipherKey; } @@ -126,10 +143,9 @@ ACTOR Future> return cipherKeys; } -namespace { - -ACTOR Future getUncachedEncryptCipherKeys(Reference const> db, - EKPGetBaseCipherKeysByIdsRequest request) { +ACTOR template +Future getUncachedEncryptCipherKeys(Reference const> db, + EKPGetBaseCipherKeysByIdsRequest request) { Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. @@ -156,10 +172,12 @@ ACTOR Future getUncachedEncryptCipherKeys(Refere using BaseCipherIndex = std::pair; -} // anonymous namespace - -ACTOR Future>> getEncryptCipherKeys( - Reference const> db, +// Get cipher keys specified by the list of cipher details. It tries to get the cipher keys from local cache. +// In case of cache miss, it fetches the cipher keys from EncryptKeyProxy and put the result in the local cache +// before return. +ACTOR template +Future>> getEncryptCipherKeys( + Reference const> db, std::unordered_set cipherDetails) { state Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); state std::unordered_map> cipherKeys; @@ -191,10 +209,10 @@ ACTOR Future>> ge // Fetch any uncached cipher keys. 
loop choose { when(EKPGetBaseCipherKeysByIdsReply reply = wait(getUncachedEncryptCipherKeys(db, request))) { - std::unordered_map> baseCipherKeys; + std::unordered_map> baseCipherKeys; for (const EKPBaseCipherDetails& baseDetails : reply.baseCipherDetails) { BaseCipherIndex baseIdx = std::make_pair(baseDetails.encryptDomainId, baseDetails.baseCipherId); - baseCipherKeys[baseIdx] = baseDetails.baseCipherKey; + baseCipherKeys[baseIdx] = baseDetails; } // Insert base cipher keys into cache and construct result. for (const BlobCipherDetails& details : cipherDetails) { @@ -211,9 +229,11 @@ ACTOR Future>> ge } Reference cipherKey = cipherKeyCache->insertCipherKey(details.encryptDomainId, details.baseCipherId, - itr->second.begin(), - itr->second.size(), - details.salt); + itr->second.baseCipherKey.begin(), + itr->second.baseCipherKey.size(), + details.salt, + itr->second.refreshAt, + itr->second.expireAt); ASSERT(cipherKey.isValid()); cipherKeys[details] = cipherKey; } @@ -225,24 +245,35 @@ ACTOR Future>> ge return cipherKeys; } -ACTOR Future getLatestSystemEncryptCipherKeys(Reference const> db) { - static std::unordered_map domains = { - { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME }, - { ENCRYPT_HEADER_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME } - }; +struct TextAndHeaderCipherKeys { + Reference cipherTextKey; + Reference cipherHeaderKey; +}; + +ACTOR template +Future getLatestEncryptCipherKeysForDomain(Reference const> db, + EncryptCipherDomainId domainId, + EncryptCipherDomainName domainName) { + std::unordered_map domains; + domains[domainId] = domainName; + domains[ENCRYPT_HEADER_DOMAIN_ID] = FDB_DEFAULT_ENCRYPT_DOMAIN_NAME; std::unordered_map> cipherKeys = wait(getLatestEncryptCipherKeys(db, domains)); - ASSERT(cipherKeys.count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) > 0); + ASSERT(cipherKeys.count(domainId) > 0); ASSERT(cipherKeys.count(ENCRYPT_HEADER_DOMAIN_ID) > 0); - TextAndHeaderCipherKeys result{ 
cipherKeys.at(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID), - cipherKeys.at(ENCRYPT_HEADER_DOMAIN_ID) }; + TextAndHeaderCipherKeys result{ cipherKeys.at(domainId), cipherKeys.at(ENCRYPT_HEADER_DOMAIN_ID) }; ASSERT(result.cipherTextKey.isValid()); ASSERT(result.cipherHeaderKey.isValid()); return result; } -ACTOR Future getEncryptCipherKeys(Reference const> db, - BlobCipherEncryptHeader header) { +template +Future getLatestSystemEncryptCipherKeys(const Reference const>& db) { + return getLatestEncryptCipherKeysForDomain(db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); +} + +ACTOR template +Future getEncryptCipherKeys(Reference const> db, BlobCipherEncryptHeader header) { std::unordered_set cipherDetails{ header.cipherTextDetails, header.cipherHeaderDetails }; std::unordered_map> cipherKeys = wait(getEncryptCipherKeys(db, cipherDetails)); @@ -254,3 +285,6 @@ ACTOR Future getEncryptCipherKeys(Reference>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) = 0; - virtual ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rowLimit) = 0; virtual ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) = 0; + virtual ThreadFuture>> readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) = 0; + + virtual ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) = 0; + virtual void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) = 0; virtual void set(const KeyRef& key, const ValueRef& value) = 0; virtual void clear(const KeyRef& begin, const KeyRef& end) = 0; @@ -172,6 +187,13 @@ public: virtual ThreadFuture purgeBlobGranules(const 
KeyRangeRef& keyRange, Version purgeVersion, bool force) = 0; virtual ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) = 0; + virtual ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) = 0; + + virtual ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) = 0; + // Interface to manage shared state across multiple connections to the same Database virtual ThreadFuture createSharedState() = 0; virtual void setSharedState(DatabaseSharedState* p) = 0; @@ -190,6 +212,7 @@ public: virtual void selectApiVersion(int apiVersion) = 0; virtual const char* getClientVersion() = 0; + virtual void useFutureProtocolVersion() = 0; virtual void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) = 0; diff --git a/fdbclient/include/fdbclient/IConfigTransaction.h b/fdbclient/include/fdbclient/IConfigTransaction.h index 8f21679e27..9246e4016e 100644 --- a/fdbclient/include/fdbclient/IConfigTransaction.h +++ b/fdbclient/include/fdbclient/IConfigTransaction.h @@ -55,7 +55,7 @@ public: Future>> getRangeSplitPoints(KeyRange const& range, int64_t chunkSize) override { throw client_invalid_operation(); } - Future>> getBlobGranuleRanges(KeyRange const& range) override { + Future>> getBlobGranuleRanges(KeyRange const& range, int rowLimit) override { throw client_invalid_operation(); } Future>> readBlobGranules(KeyRange const& range, diff --git a/fdbclient/include/fdbclient/ISingleThreadTransaction.h b/fdbclient/include/fdbclient/ISingleThreadTransaction.h index b44f58b464..6143ec8605 100644 --- a/fdbclient/include/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/include/fdbclient/ISingleThreadTransaction.h @@ -80,7 +80,7 @@ public: virtual Future>> getAddressesForKey(Key const& key) = 0; virtual Future>> getRangeSplitPoints(KeyRange const& range, int64_t 
chunkSize) = 0; virtual Future getEstimatedRangeSizeBytes(KeyRange const& keys) = 0; - virtual Future>> getBlobGranuleRanges(KeyRange const& range) = 0; + virtual Future>> getBlobGranuleRanges(KeyRange const& range, int rangeLimit) = 0; virtual Future>> readBlobGranules(KeyRange const& range, Version begin, Optional readVersion, diff --git a/fdbclient/include/fdbclient/KeyBackedTypes.h b/fdbclient/include/fdbclient/KeyBackedTypes.h index 2977262b6c..a3fee57644 100644 --- a/fdbclient/include/fdbclient/KeyBackedTypes.h +++ b/fdbclient/include/fdbclient/KeyBackedTypes.h @@ -25,6 +25,7 @@ #include "fdbclient/ClientBooleanParams.h" #include "fdbclient/CommitTransaction.h" +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/GenericTransactionHelper.h" #include "fdbclient/Subspace.h" #include "flow/ObjectSerializer.h" @@ -156,6 +157,12 @@ struct NullCodec { static Standalone unpack(Standalone val) { return val; } }; +template +struct BinaryCodec { + static Standalone pack(T val) { return BinaryWriter::toValue(val, Unversioned()); } + static T unpack(Standalone val) { return BinaryReader::fromStringRef(val, Unversioned()); } +}; + template struct KeyBackedRangeResult { std::vector results; @@ -364,6 +371,16 @@ public: })); } + // Get key's value or defaultValue if it doesn't exist + template + Future getD(Transaction tr, + KeyType const& key, + Snapshot snapshot = Snapshot::False, + ValueType defaultValue = ValueType()) const { + return map(get(tr, key, snapshot), + [=](Optional val) -> ValueType { return val.orDefault(defaultValue); }); + } + // Returns a Property that can be get/set that represents key's entry in this this. 
KeyBackedProperty getProperty(KeyType const& key) const { return subspace.begin.withSuffix(KeyCodec::pack(key)); @@ -378,6 +395,13 @@ public: return k.expectedSize() + v.expectedSize(); } + template + void atomicOp(Transaction tr, KeyType const& key, ValueType const& val, MutationRef::Type type) { + Key k = subspace.begin.withSuffix(KeyCodec::pack(key)); + Value v = ValueCodec::pack(val); + tr->atomicOp(k, v, type); + } + template void erase(Transaction tr, KeyType const& key) { tr->clear(subspace.begin.withSuffix(KeyCodec::pack(key))); diff --git a/fdbclient/include/fdbclient/KeyRangeMap.h b/fdbclient/include/fdbclient/KeyRangeMap.h index 88cce027a8..f88dc72dda 100644 --- a/fdbclient/include/fdbclient/KeyRangeMap.h +++ b/fdbclient/include/fdbclient/KeyRangeMap.h @@ -136,6 +136,16 @@ Future krmGetRanges(Reference const& tr, KeyRange const& keys, int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); +Future krmGetRangesUnaligned(Transaction* const& tr, + Key const& mapPrefix, + KeyRange const& keys, + int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, + int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); +Future krmGetRangesUnaligned(Reference const& tr, + Key const& mapPrefix, + KeyRange const& keys, + int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, + int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); void krmSetPreviouslyEmptyRange(Transaction* tr, const KeyRef& mapPrefix, const KeyRangeRef& keys, @@ -162,7 +172,7 @@ Future krmSetRangeCoalescing(Reference const& t KeyRange const& range, KeyRange const& maxRange, Value const& value); -RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv); +RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv, bool align = true); template std::vector> KeyRangeMap::getAffectedRangesAfterInsertion( diff --git a/fdbclient/include/fdbclient/Metacluster.h 
b/fdbclient/include/fdbclient/Metacluster.h new file mode 100644 index 0000000000..99abed564b --- /dev/null +++ b/fdbclient/include/fdbclient/Metacluster.h @@ -0,0 +1,183 @@ +/* + * Metacluster.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLIENT_METACLUSTER_H +#define FDBCLIENT_METACLUSTER_H +#include "CoordinationInterface.h" +#include "json_spirit/json_spirit_value.h" +#pragma once + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/KeyBackedTypes.h" +#include "flow/flat_buffers.h" + +struct ClusterUsage { + int numTenantGroups = 0; + + ClusterUsage() = default; + ClusterUsage(int numTenantGroups) : numTenantGroups(numTenantGroups) {} + + json_spirit::mObject toJson() const; + + bool operator==(const ClusterUsage& other) const noexcept { return numTenantGroups == other.numTenantGroups; } + bool operator!=(const ClusterUsage& other) const noexcept { return !(*this == other); } + bool operator<(const ClusterUsage& other) const noexcept { return numTenantGroups < other.numTenantGroups; } + + template + void serialize(Ar& ar) { + serializer(ar, numTenantGroups); + } +}; + +template <> +struct Traceable : std::true_type { + static std::string toString(const ClusterUsage& value) { + return format("NumTenantGroups: %d", value.numTenantGroups); + } +}; + +// Represents the various states 
that a data cluster could be in. +// +// READY - the data cluster is active +// REMOVING - the data cluster is being removed and cannot have its configuration changed or any tenants created +// RESTORING - the data cluster is being restored and cannot have its configuration changed or any tenants +// created/updated/deleted. +enum class DataClusterState { READY, REMOVING, RESTORING }; + +struct DataClusterEntry { + constexpr static FileIdentifier file_identifier = 929511; + + static std::string clusterStateToString(DataClusterState clusterState); + static DataClusterState stringToClusterState(std::string stateStr); + + UID id; + ClusterUsage capacity; + ClusterUsage allocated; + + DataClusterState clusterState = DataClusterState::READY; + + DataClusterEntry() = default; + DataClusterEntry(ClusterUsage capacity) : capacity(capacity) {} + DataClusterEntry(UID id, ClusterUsage capacity, ClusterUsage allocated) + : id(id), capacity(capacity), allocated(allocated) {} + + // Returns true if all configurable properties match + bool matchesConfiguration(DataClusterEntry const& other) const { + return id == other.id && capacity == other.capacity; + } + + bool hasCapacity() const { return allocated < capacity; } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static DataClusterEntry decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + + json_spirit::mObject toJson() const; + + template + void serialize(Ar& ar) { + serializer(ar, id, capacity, allocated, clusterState); + } +}; + +struct MetaclusterRegistrationEntry { + constexpr static FileIdentifier file_identifier = 13448589; + + ClusterType clusterType; + + ClusterName metaclusterName; + ClusterName name; + UID metaclusterId; + UID id; + + MetaclusterRegistrationEntry() = default; + MetaclusterRegistrationEntry(ClusterName metaclusterName, UID metaclusterId) + : clusterType(ClusterType::METACLUSTER_MANAGEMENT), 
metaclusterName(metaclusterName), name(metaclusterName), + metaclusterId(metaclusterId), id(metaclusterId) {} + MetaclusterRegistrationEntry(ClusterName metaclusterName, ClusterName name, UID metaclusterId, UID id) + : clusterType(ClusterType::METACLUSTER_DATA), metaclusterName(metaclusterName), name(name), + metaclusterId(metaclusterId), id(id) { + ASSERT(metaclusterName != name && metaclusterId != id); + } + + // Returns true if this entry is associated with the same cluster as the passed in entry. If one entry is from the + // management cluster and the other is from a data cluster, this checks whether they are part of the same + // metacluster. + bool matches(MetaclusterRegistrationEntry const& other) const { + if (metaclusterName != other.metaclusterName || metaclusterId != other.metaclusterId) { + return false; + } else if (clusterType == ClusterType::METACLUSTER_DATA && other.clusterType == ClusterType::METACLUSTER_DATA && + (name != other.name || id != other.id)) { + return false; + } + + return true; + } + + MetaclusterRegistrationEntry toManagementClusterRegistration() const { + ASSERT(clusterType == ClusterType::METACLUSTER_DATA); + return MetaclusterRegistrationEntry(metaclusterName, metaclusterId); + } + + MetaclusterRegistrationEntry toDataClusterRegistration(ClusterName name, UID id) const { + ASSERT(clusterType == ClusterType::METACLUSTER_MANAGEMENT); + return MetaclusterRegistrationEntry(metaclusterName, name, metaclusterId, id); + } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static MetaclusterRegistrationEntry decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + static Optional decode(Optional value) { + return value.map( + [](ValueRef const& v) { return MetaclusterRegistrationEntry::decode(v); }); + } + + std::string toString() const { + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + return fmt::format( + "metacluster name: {}, metacluster id: 
{}", printable(metaclusterName), metaclusterId.shortString()); + } else { + return fmt::format("metacluster name: {}, metacluster id: {}, data cluster name: {}, data cluster id: {}", + printable(metaclusterName), + metaclusterId.shortString(), + printable(name), + id.shortString()); + } + } + + template + void serialize(Ar& ar) { + serializer(ar, clusterType, metaclusterName, name, metaclusterId, id); + } +}; + +template <> +struct Traceable : std::true_type { + static std::string toString(MetaclusterRegistrationEntry const& entry) { return entry.toString(); } +}; + +struct MetaclusterMetadata { + // Registration information for a metacluster, stored on both management and data clusters + static KeyBackedObjectProperty& metaclusterRegistration(); +}; + +#endif \ No newline at end of file diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h new file mode 100644 index 0000000000..e0b9c33629 --- /dev/null +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -0,0 +1,1926 @@ +/* + * MetaclusterManagement.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "fdbclient/FDBOptions.g.h" +#include "flow/IRandom.h" +#include "flow/ThreadHelper.actor.h" +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H) +#define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H +#include "fdbclient/MetaclusterManagement.actor.g.h" +#elif !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H) +#define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/GenericTransactionHelper.h" +#include "fdbclient/GenericManagementAPI.actor.h" +#include "fdbclient/KeyBackedTypes.h" +#include "fdbclient/Metacluster.h" +#include "fdbclient/MultiVersionTransaction.h" +#include "fdbclient/SystemData.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/VersionedMap.h" +#include "flow/flat_buffers.h" +#include "flow/actorcompiler.h" // has to be last include + +// This file provides the interfaces to manage metacluster metadata. +// +// These transactions can operate on clusters at different versions, so care needs to be taken to update the metadata +// according to the cluster version. +// +// Support is maintained in this file for the current and the previous protocol versions. 
+ +struct DataClusterMetadata { + constexpr static FileIdentifier file_identifier = 5573993; + + DataClusterEntry entry; + ClusterConnectionString connectionString; + + DataClusterMetadata() = default; + DataClusterMetadata(DataClusterEntry const& entry, ClusterConnectionString const& connectionString) + : entry(entry), connectionString(connectionString) {} + + bool matchesConfiguration(DataClusterMetadata const& other) const { + return entry.matchesConfiguration(other.entry) && connectionString == other.connectionString; + } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static DataClusterMetadata decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + + json_spirit::mValue toJson() const { + json_spirit::mObject obj = entry.toJson(); + obj["connection_string"] = connectionString.toString(); + return obj; + } + + template + void serialize(Ar& ar) { + serializer(ar, connectionString, entry); + } +}; + +FDB_DECLARE_BOOLEAN_PARAM(AddNewTenants); +FDB_DECLARE_BOOLEAN_PARAM(RemoveMissingTenants); + +namespace MetaclusterAPI { + +struct ManagementClusterMetadata { + struct ConnectionStringCodec { + static inline Standalone pack(ClusterConnectionString const& val) { + return StringRef(val.toString()); + } + static inline ClusterConnectionString unpack(Standalone const& val) { + return ClusterConnectionString(val.toString()); + } + }; + + static TenantMetadataSpecification& tenantMetadata(); + + // A map from cluster name to the metadata associated with a cluster + static KeyBackedObjectMap& dataClusters(); + + // A map from cluster name to the connection string for the cluster + static KeyBackedMap, ConnectionStringCodec> + dataClusterConnectionRecords; + + // A set of non-full clusters where the key is the tuple (num tenant groups allocated, cluster name). 
+ static KeyBackedSet clusterCapacityIndex; + + // A map from cluster name to a count of tenants + static KeyBackedMap, BinaryCodec> clusterTenantCount; + + // A set of (cluster name, tenant name, tenant ID) tuples ordered by cluster + static KeyBackedSet clusterTenantIndex; + + // A set of (cluster, tenant group name) tuples ordered by cluster + static KeyBackedSet clusterTenantGroupIndex; +}; + +ACTOR Future> openDatabase(ClusterConnectionString connectionString); + +ACTOR template +Future> tryGetClusterTransaction(Transaction tr, ClusterName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + state Future metaclusterRegistrationCheck = + TenantAPI::checkTenantMode(tr, ClusterType::METACLUSTER_MANAGEMENT); + + state Future> clusterEntryFuture = + ManagementClusterMetadata::dataClusters().get(tr, name); + state Future> connectionRecordFuture = + ManagementClusterMetadata::dataClusterConnectionRecords.get(tr, name); + + wait(metaclusterRegistrationCheck); + + state Optional clusterEntry = wait(clusterEntryFuture); + Optional connectionString = wait(connectionRecordFuture); + + if (clusterEntry.present()) { + ASSERT(connectionString.present()); + return Optional(DataClusterMetadata(clusterEntry.get(), connectionString.get())); + } else { + return Optional(); + } +} + +ACTOR template +Future> tryGetCluster(Reference db, ClusterName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional metadata = wait(tryGetClusterTransaction(tr, name)); + return metadata; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future getClusterTransaction(Transaction tr, ClusterNameRef name) { + Optional metadata = wait(tryGetClusterTransaction(tr, name)); + if (!metadata.present()) { + throw cluster_not_found(); + } + + return metadata.get(); +} + +ACTOR template +Future getCluster(Reference db, ClusterName name) { + Optional metadata = 
wait(tryGetCluster(db, name)); + if (!metadata.present()) { + throw cluster_not_found(); + } + + return metadata.get(); +} + +ACTOR template +Future> getAndOpenDatabase(Transaction managementTr, ClusterName clusterName) { + DataClusterMetadata clusterMetadata = wait(getClusterTransaction(managementTr, clusterName)); + Reference db = wait(openDatabase(clusterMetadata.connectionString)); + return db; +} + +template +struct MetaclusterOperationContext { + Reference managementDb; + Reference dataClusterDb; + + Optional clusterName; + + Optional metaclusterRegistration; + Optional dataClusterMetadata; + + MetaclusterOperationContext(Reference managementDb, Optional clusterName = {}) + : managementDb(managementDb), clusterName(clusterName) {} + + // Run a transaction on the management cluster. This verifies that the cluster is a management cluster and matches + // the same metacluster that we've run any previous transactions on. If a clusterName is set, it also verifies that + // the specified cluster is present. Stores the metaclusterRegistration entry and, if a clusterName is set, the + // dataClusterMetadata and dataClusterDb in the context. + ACTOR template + static Future()(Reference()).getValue())> + runManagementTransaction(MetaclusterOperationContext* self, Function func) { + state Reference tr = self->managementDb->createTransaction(); + state bool clusterPresentAtStart = self->clusterName.present(); + loop { + try { + // If this transaction is retrying and didn't have the cluster name set at the beginning, clear it out + // to be set again in the next iteration. 
+ if (!clusterPresentAtStart) { + self->clearCluster(); + } + + // Get the data cluster metadata for the specified cluster, if present + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Future> dataClusterMetadataFuture; + if (self->clusterName.present()) { + dataClusterMetadataFuture = tryGetClusterTransaction(tr, self->clusterName.get()); + } + + // Get the metacluster registration information + state Optional currentMetaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + state Optional currentDataClusterMetadata; + if (self->clusterName.present()) { + wait(store(currentDataClusterMetadata, dataClusterMetadataFuture)); + } + + // Check that this is a management cluster and is the same metacluster that any previous transactions + // have run on. + if (!currentMetaclusterRegistration.present() || + currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_MANAGEMENT) { + throw invalid_metacluster_operation(); + } else if (self->metaclusterRegistration.present() && + !self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { + throw invalid_metacluster_operation(); + } + + // If a cluster was specified, check that the cluster metadata is present. If so, load it and store it + // in the context. Additionally, store the data cluster details in the local metacluster registration + // entry. + if (self->clusterName.present()) { + if (!currentDataClusterMetadata.present()) { + throw cluster_not_found(); + } else { + currentMetaclusterRegistration = currentMetaclusterRegistration.get().toDataClusterRegistration( + self->clusterName.get(), currentDataClusterMetadata.get().entry.id); + } + } + + // Store the metacluster registration entry + if (!self->metaclusterRegistration.present()) { + self->metaclusterRegistration = currentMetaclusterRegistration; + } + + // Check that our data cluster has the same ID as previous transactions. 
If so, then store the updated + // cluster metadata in the context and open a connection to the data DB. + if (self->dataClusterMetadata.present() && + self->dataClusterMetadata.get().entry.id != currentDataClusterMetadata.get().entry.id) { + throw cluster_not_found(); + } else if (self->clusterName.present()) { + self->dataClusterMetadata = currentDataClusterMetadata; + if (!self->dataClusterDb) { + wait( + store(self->dataClusterDb, openDatabase(self->dataClusterMetadata.get().connectionString))); + } + } + + state decltype(std::declval()(Reference()).getValue()) result = + wait(func(tr)); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + return result; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + template + Future()(Reference()).getValue())> + runManagementTransaction(Function func) { + return runManagementTransaction(this, func); + } + + // Runs a transaction on the data cluster. This requires that a cluster name be set and that a transaction has + // already been run on the management cluster to populate the needed metadata. This verifies that the data cluster + // has the expected ID and is part of the metacluster that previous transactions have run on. 
+ ACTOR template + static Future()(Reference()).getValue())> + runDataClusterTransaction(MetaclusterOperationContext* self, Function func) { + ASSERT(self->dataClusterDb); + ASSERT(self->dataClusterMetadata.present()); + ASSERT(self->metaclusterRegistration.present() && + self->metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA); + + state Reference tr = self->dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Optional currentMetaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + // Check that this is the expected data cluster and is part of the right metacluster + if (!currentMetaclusterRegistration.present() || + currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_DATA) { + throw invalid_metacluster_operation(); + } else if (!self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { + throw invalid_metacluster_operation(); + } + + state decltype(std::declval()(Reference()).getValue()) result = + wait(func(tr)); + + wait(safeThreadFutureToFuture(tr->commit())); + return result; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + template + Future()(Reference()).getValue())> + runDataClusterTransaction(Function func) { + return runDataClusterTransaction(this, func); + } + + ACTOR static Future updateClusterName(MetaclusterOperationContext* self, + Reference tr) { + state DataClusterMetadata currentDataClusterMetadata = wait(getClusterTransaction(tr, self->clusterName.get())); + + self->metaclusterRegistration = self->metaclusterRegistration.get().toDataClusterRegistration( + self->clusterName.get(), currentDataClusterMetadata.entry.id); + + self->dataClusterMetadata = currentDataClusterMetadata; + if (!self->dataClusterDb) { + wait(store(self->dataClusterDb, openDatabase(self->dataClusterMetadata.get().connectionString))); + } + + return Void(); 
+ } + + // Sets the cluster used in this context. This must be called from a management cluster transaction, and it + // will load the cluster metadata and connect to the cluster. + Future setCluster(Reference tr, ClusterName clusterName) { + ASSERT(!this->clusterName.present()); + ASSERT(!dataClusterMetadata.present()); + ASSERT(metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_MANAGEMENT); + this->clusterName = clusterName; + return updateClusterName(this, tr); + } + + // Clears the chosen cluster for this context. This is useful if we are retrying a transaction that expects an + // uninitialized cluster. + void clearCluster() { + clusterName = {}; + dataClusterMetadata = {}; + dataClusterDb = {}; + if (metaclusterRegistration.present() && + metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA) { + metaclusterRegistration = metaclusterRegistration.get().toManagementClusterRegistration(); + } + } +}; + +template +Future> tryGetTenantTransaction(Transaction tr, TenantName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + return ManagementClusterMetadata::tenantMetadata().tenantMap.get(tr, name); +} + +ACTOR template +Future> tryGetTenant(Reference db, TenantName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + Optional entry = wait(tryGetTenantTransaction(tr, name)); + return entry; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future getTenantTransaction(Transaction tr, TenantName name) { + Optional entry = wait(tryGetTenantTransaction(tr, name)); + if (!entry.present()) { + throw tenant_not_found(); + } + + return entry.get(); +} + +ACTOR template +Future getTenant(Reference db, TenantName name) { + Optional entry = wait(tryGetTenant(db, name)); + if (!entry.present()) { + throw tenant_not_found(); + } + + return 
entry.get(); +} + +ACTOR template +Future managementClusterCheckEmpty(Transaction tr) { + state Future>> tenantsFuture = + TenantMetadata::tenantMap().getRange(tr, {}, {}, 1); + state typename transaction_future_type::type dbContentsFuture = + tr->getRange(normalKeys, 1); + + KeyBackedRangeResult> tenants = wait(tenantsFuture); + if (!tenants.results.empty()) { + throw cluster_not_empty(); + } + + RangeResult dbContents = wait(safeThreadFutureToFuture(dbContentsFuture)); + if (!dbContents.empty()) { + throw cluster_not_empty(); + } + + return Void(); +} + +ACTOR template +Future> createMetacluster(Reference db, ClusterName name) { + state Reference tr = db->createTransaction(); + state Optional metaclusterUid; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Future> metaclusterRegistrationFuture = + MetaclusterMetadata::metaclusterRegistration().get(tr); + + wait(managementClusterCheckEmpty(tr)); + + Optional existingRegistration = wait(metaclusterRegistrationFuture); + if (existingRegistration.present()) { + if (metaclusterUid.present() && metaclusterUid.get() == existingRegistration.get().metaclusterId) { + return Optional(); + } else { + return format("cluster is already registered as a %s named `%s'", + existingRegistration.get().clusterType == ClusterType::METACLUSTER_DATA + ? 
"data cluster" + : "metacluster", + printable(existingRegistration.get().name).c_str()); + } + } + + if (!metaclusterUid.present()) { + metaclusterUid = deterministicRandom()->randomUniqueID(); + } + + MetaclusterMetadata::metaclusterRegistration().set( + tr, MetaclusterRegistrationEntry(name, metaclusterUid.get())); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return Optional(); +} + +ACTOR template +Future decommissionMetacluster(Reference db) { + state Reference tr = db->createTransaction(); + state bool firstTry = true; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType != ClusterType::METACLUSTER_MANAGEMENT) { + if (firstTry) { + throw invalid_metacluster_operation(); + } else { + return Void(); + } + } + + // Erase all metadata not associated with specific tenants prior to checking + // cluster emptiness + ManagementClusterMetadata::tenantMetadata().tenantCount.clear(tr); + ManagementClusterMetadata::tenantMetadata().lastTenantId.clear(tr); + ManagementClusterMetadata::tenantMetadata().tenantTombstones.clear(tr); + ManagementClusterMetadata::tenantMetadata().tombstoneCleanupData.clear(tr); + + wait(managementClusterCheckEmpty(tr)); + MetaclusterMetadata::metaclusterRegistration().clear(tr); + + firstTry = false; + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return Void(); +} + +template +void updateClusterCapacityIndex(Transaction tr, + ClusterName name, + DataClusterEntry const& previousEntry, + DataClusterEntry const& updatedEntry) { + // Entries are put in the cluster capacity index ordered by how many items are already allocated to them + if (previousEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.erase( + tr, 
Tuple::makeTuple(previousEntry.allocated.numTenantGroups, name)); + } + if (updatedEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.insert( + tr, Tuple::makeTuple(updatedEntry.allocated.numTenantGroups, name)); + } +} + +// This should only be called from a transaction that has already confirmed that the cluster entry +// is present. The updatedEntry should use the existing entry and modify only those fields that need +// to be changed. +template +void updateClusterMetadata(Transaction tr, + ClusterNameRef name, + DataClusterMetadata const& previousMetadata, + Optional const& updatedConnectionString, + Optional const& updatedEntry) { + + if (updatedEntry.present()) { + if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } + ManagementClusterMetadata::dataClusters().set(tr, name, updatedEntry.get()); + updateClusterCapacityIndex(tr, name, previousMetadata.entry, updatedEntry.get()); + } + if (updatedConnectionString.present()) { + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, name, updatedConnectionString.get()); + } +} + +template +struct RegisterClusterImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + ClusterName clusterName; + ClusterConnectionString connectionString; + DataClusterEntry clusterEntry; + + RegisterClusterImpl(Reference managementDb, + ClusterName clusterName, + ClusterConnectionString connectionString, + DataClusterEntry clusterEntry) + : ctx(managementDb), clusterName(clusterName), connectionString(connectionString), clusterEntry(clusterEntry) {} + + // Check that cluster name is available + ACTOR static Future registrationPrecheck(RegisterClusterImpl* self, Reference tr) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (dataClusterMetadata.present()) { + throw cluster_already_exists(); + } + + return Void(); + } + + ACTOR static Future configureDataCluster(RegisterClusterImpl* 
self) { + state Reference dataClusterDb = wait(openDatabase(self->connectionString)); + state Reference tr = dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Future>> existingTenantsFuture = + TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, 1); + state ThreadFuture existingDataFuture = tr->getRange(normalKeys, 1); + + // Check whether this cluster has already been registered + state Optional existingRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + if (existingRegistration.present()) { + if (existingRegistration.get().clusterType != ClusterType::METACLUSTER_DATA || + existingRegistration.get().name != self->clusterName || + !existingRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { + throw cluster_already_registered(); + } else { + // We already successfully registered the cluster with these details, so there's nothing to do + self->clusterEntry.id = existingRegistration.get().id; + return Void(); + } + } + + // Check for any existing data + std::vector> existingTenants = + wait(safeThreadFutureToFuture(existingTenantsFuture)); + if (!existingTenants.empty()) { + TraceEvent(SevWarn, "CannotRegisterClusterWithTenants").detail("ClusterName", self->clusterName); + throw cluster_not_empty(); + } + + RangeResult existingData = wait(safeThreadFutureToFuture(existingDataFuture)); + if (!existingData.empty()) { + TraceEvent(SevWarn, "CannotRegisterClusterWithData").detail("ClusterName", self->clusterName); + throw cluster_not_empty(); + } + + self->clusterEntry.id = deterministicRandom()->randomUniqueID(); + MetaclusterMetadata::metaclusterRegistration().set( + tr, + self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, + self->clusterEntry.id)); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + + TraceEvent("ConfiguredDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", 
self->clusterEntry.id) + .detail("Capacity", self->clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + // Store the cluster entry for the new cluster + ACTOR static Future registerInManagementCluster(RegisterClusterImpl* self, + Reference tr) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (dataClusterMetadata.present() && !dataClusterMetadata.get().matchesConfiguration( + DataClusterMetadata(self->clusterEntry, self->connectionString))) { + throw cluster_already_exists(); + } else if (!dataClusterMetadata.present()) { + self->clusterEntry.allocated = ClusterUsage(); + + if (self->clusterEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.insert( + tr, Tuple::makeTuple(self->clusterEntry.allocated.numTenantGroups, self->clusterName)); + } + ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, self->clusterEntry); + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); + } + + TraceEvent("RegisteredDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", self->clusterEntry.id) + .detail("Capacity", self->clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } + + ACTOR static Future run(RegisterClusterImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return registrationPrecheck(self, tr); })); + // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we don't + // have a metacluster registration on the data cluster + wait(configureDataCluster(self)); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { 
return registerInManagementCluster(self, tr); })); + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future registerCluster(Reference db, + ClusterName name, + ClusterConnectionString connectionString, + DataClusterEntry entry) { + state RegisterClusterImpl impl(db, name, connectionString, entry); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future restoreCluster(Reference db, + ClusterName name, + std::string connectionString, + DataClusterEntry entry, + AddNewTenants addNewTenants, + RemoveMissingTenants removeMissingTenants) { + // TODO: add implementation + wait(delay(0.0)); + return Void(); +} + +template +struct RemoveClusterImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + bool forceRemove; + + // Parameters set in markClusterRemoving + Optional lastTenantId; + + RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove) + : ctx(managementDb, clusterName), forceRemove(forceRemove) {} + + // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. + ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { + if (!self->forceRemove && self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups > 0) { + throw cluster_not_empty(); + } else if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::REMOVING) { + // Mark the cluster in a removing state while we finish the remaining removal steps. This prevents new + // tenants from being assigned to it. 
+ DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; + updatedEntry.clusterState = DataClusterState::REMOVING; + updatedEntry.capacity.numTenantGroups = 0; + + updateClusterMetadata(tr, + self->ctx.clusterName.get(), + self->ctx.dataClusterMetadata.get(), + Optional(), + updatedEntry); + } + + ManagementClusterMetadata::clusterCapacityIndex.erase( + tr, + Tuple::makeTuple(self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups, + self->ctx.clusterName.get())); + + // Get the last allocated tenant ID to be used on the detached data cluster + if (self->forceRemove) { + Optional lastId = wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.get(tr)); + self->lastTenantId = lastId; + } + + TraceEvent("MarkedDataClusterRemoving") + .detail("Name", self->ctx.clusterName.get()) + .detail("Version", tr->getCommittedVersion()); + + return true; + } + + // Delete metacluster metadata from the data cluster + ACTOR static Future updateDataCluster(RemoveClusterImpl* self, Reference tr) { + // Delete metacluster related metadata + MetaclusterMetadata::metaclusterRegistration().clear(tr); + TenantMetadata::tenantTombstones().clear(tr); + TenantMetadata::tombstoneCleanupData().clear(tr); + + // If we are force removing a cluster, then it will potentially contain tenants that have IDs + // larger than the next tenant ID to be allocated on the cluster. To avoid collisions, we advance + // the ID so that it will be the larger of the current one on the data cluster and the management + // cluster. 
+ if (self->lastTenantId.present()) { + Optional lastId = wait(TenantMetadata::lastTenantId().get(tr)); + if (!lastId.present() || lastId.get() < self->lastTenantId.get()) { + TenantMetadata::lastTenantId().set(tr, self->lastTenantId.get()); + } + } + + TraceEvent("ReconfiguredDataCluster") + .detail("Name", self->ctx.clusterName.get()) + .detail("Version", tr->getCommittedVersion()); + + return Void(); + } + + // Returns true if all tenants have been purged + ACTOR static Future purgeTenants(RemoveClusterImpl* self, + Reference tr, + std::pair clusterTupleRange) { + ASSERT(self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING); + + // Get the list of tenants + state Future> tenantEntriesFuture = + ManagementClusterMetadata::clusterTenantIndex.getRange( + tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->REMOVE_CLUSTER_TENANT_BATCH_SIZE); + + state KeyBackedRangeResult tenantEntries = wait(tenantEntriesFuture); + + // Erase each tenant from the tenant map on the management cluster + for (Tuple entry : tenantEntries.results) { + ASSERT(entry.getString(0) == self->ctx.clusterName.get()); + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, entry.getString(1)); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, entry.getInt(2)); + } + + // Erase all of the tenants processed in this transaction from the cluster tenant index + if (!tenantEntries.results.empty()) { + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, + clusterTupleRange.first, + Tuple::makeTuple(self->ctx.clusterName.get(), keyAfter(tenantEntries.results.rbegin()->getString(1)))); + } + + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp( + tr, -tenantEntries.results.size(), MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, self->ctx.clusterName.get(), -tenantEntries.results.size(), MutationRef::AddValue); + + return !tenantEntries.more; + } + + // Returns true if 
all tenant groups and the data cluster have been purged + ACTOR static Future purgeTenantGroupsAndDataCluster(RemoveClusterImpl* self, + Reference tr, + std::pair clusterTupleRange) { + ASSERT(self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING); + + // Get the list of tenant groups + state Future> tenantGroupEntriesFuture = + ManagementClusterMetadata::clusterTenantGroupIndex.getRange( + tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->REMOVE_CLUSTER_TENANT_BATCH_SIZE); + + // Erase each tenant group from the tenant group map and the tenant group tenant index + state KeyBackedRangeResult tenantGroupEntries = wait(tenantGroupEntriesFuture); + for (Tuple entry : tenantGroupEntries.results) { + ASSERT(entry.getString(0) == self->ctx.clusterName.get()); + TenantGroupName tenantGroup = entry.getString(1); + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.erase( + tr, Tuple::makeTuple(tenantGroup), Tuple::makeTuple(keyAfter(tenantGroup))); + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.erase(tr, tenantGroup); + } + + if (!tenantGroupEntries.results.empty()) { + // Erase all of the tenant groups processed in this transaction from the cluster tenant group index + ManagementClusterMetadata::clusterTenantGroupIndex.erase( + tr, + clusterTupleRange.first, + Tuple::makeTuple(self->ctx.clusterName.get(), + keyAfter(tenantGroupEntries.results.rbegin()->getString(1)))); + } + + // Erase the data cluster record from the management cluster if processing our last batch + if (!tenantGroupEntries.more) { + ManagementClusterMetadata::dataClusters().erase(tr, self->ctx.clusterName.get()); + ManagementClusterMetadata::dataClusterConnectionRecords.erase(tr, self->ctx.clusterName.get()); + ManagementClusterMetadata::clusterTenantCount.erase(tr, self->ctx.clusterName.get()); + } + + return !tenantGroupEntries.more; + } + + // Remove all metadata associated with the data cluster from the management cluster 
+ ACTOR static Future managementClusterPurgeDataCluster(RemoveClusterImpl* self) { + state std::pair clusterTupleRange = std::make_pair( + Tuple::makeTuple(self->ctx.clusterName.get()), Tuple::makeTuple(keyAfter(self->ctx.clusterName.get()))); + + // First remove all tenants associated with the data cluster from the management cluster + loop { + bool clearedAll = wait(self->ctx.runManagementTransaction( + [self = self, clusterTupleRange = clusterTupleRange](Reference tr) { + return purgeTenants(self, tr, clusterTupleRange); + })); + + if (clearedAll) { + break; + } + } + + // Next remove all tenant groups associated with the data cluster from the management cluster + loop { + bool clearedAll = wait(self->ctx.runManagementTransaction( + [self = self, clusterTupleRange = clusterTupleRange](Reference tr) { + return purgeTenantGroupsAndDataCluster(self, tr, clusterTupleRange); + })); + if (clearedAll) { + break; + } + } + + TraceEvent("RemovedDataCluster").detail("Name", self->ctx.clusterName.get()); + return Void(); + } + + ACTOR static Future run(RemoveClusterImpl* self) { + state bool clusterIsPresent; + try { + wait(store(clusterIsPresent, + self->ctx.runManagementTransaction([self = self](Reference tr) { + return markClusterRemoving(self, tr); + }))); + } catch (Error& e) { + // If the transaction retries after success or if we are trying a second time to remove the cluster, it will + // throw an error indicating that the removal has already started + if (e.code() == error_code_cluster_removed) { + clusterIsPresent = true; + } else { + throw; + } + } + + if (clusterIsPresent) { + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + } catch (Error& e) { + // If this transaction gets retried, the metacluster information may have already been erased. 
+ if (e.code() != error_code_invalid_metacluster_operation) { + throw; + } + } + + // This runs multiple transactions, so the run transaction calls are inside the function + try { + wait(managementClusterPurgeDataCluster(self)); + } catch (Error& e) { + // If this transaction gets retried, the cluster may have already been deleted. + if (e.code() != error_code_cluster_not_found) { + throw; + } + } + } + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future removeCluster(Reference db, ClusterName name, bool forceRemove) { + state RemoveClusterImpl impl(db, name, forceRemove); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future> listClustersTransaction(Transaction tr, + ClusterNameRef begin, + ClusterNameRef end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + state Future tenantModeCheck = TenantAPI::checkTenantMode(tr, ClusterType::METACLUSTER_MANAGEMENT); + + state Future>> clusterEntriesFuture = + ManagementClusterMetadata::dataClusters().getRange(tr, begin, end, limit); + state Future>> connectionStringFuture = + ManagementClusterMetadata::dataClusterConnectionRecords.getRange(tr, begin, end, limit); + + wait(tenantModeCheck); + + state KeyBackedRangeResult> clusterEntries = + wait(safeThreadFutureToFuture(clusterEntriesFuture)); + KeyBackedRangeResult> connectionStrings = + wait(safeThreadFutureToFuture(connectionStringFuture)); + + ASSERT(clusterEntries.results.size() == connectionStrings.results.size()); + + std::map clusters; + for (int i = 0; i < clusterEntries.results.size(); ++i) { + ASSERT(clusterEntries.results[i].first == connectionStrings.results[i].first); + clusters[clusterEntries.results[i].first] = + DataClusterMetadata(clusterEntries.results[i].second, connectionStrings.results[i].second); + } + + return clusters; +} + +ACTOR template +Future> listClusters(Reference db, + ClusterName begin, + ClusterName end, + int limit) { + state Reference tr = db->createTransaction(); + + 
loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + std::map clusters = wait(listClustersTransaction(tr, begin, end, limit)); + + return clusters; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +template +void managementClusterAddTenantToGroup(Transaction tr, + TenantName tenantName, + TenantMapEntry tenantEntry, + DataClusterMetadata* clusterMetadata, + bool groupAlreadyExists) { + if (tenantEntry.tenantGroup.present()) { + if (tenantEntry.tenantGroup.get().startsWith("\xff"_sr)) { + throw invalid_tenant_group_name(); + } + + if (!groupAlreadyExists) { + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.set( + tr, tenantEntry.tenantGroup.get(), TenantGroupEntry(tenantEntry.assignedCluster)); + ManagementClusterMetadata::clusterTenantGroupIndex.insert( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantGroup.get())); + } + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.insert( + tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), tenantName)); + } + + if (!groupAlreadyExists) { + ASSERT(clusterMetadata->entry.hasCapacity()); + + DataClusterEntry updatedEntry = clusterMetadata->entry; + ++updatedEntry.allocated.numTenantGroups; + + updateClusterMetadata( + tr, tenantEntry.assignedCluster.get(), *clusterMetadata, Optional(), updatedEntry); + + clusterMetadata->entry = updatedEntry; + } +} + +ACTOR template +Future managementClusterRemoveTenantFromGroup(Transaction tr, + TenantName tenantName, + TenantMapEntry tenantEntry, + DataClusterMetadata* clusterMetadata, + bool isRenamePair = false) { + state bool updateClusterCapacity = !tenantEntry.tenantGroup.present() && !isRenamePair; + if (tenantEntry.tenantGroup.present()) { + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), tenantName)); + + state KeyBackedSet::RangeResultType result = + 
wait(ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.getRange( + tr, + Tuple::makeTuple(tenantEntry.tenantGroup.get()), + Tuple::makeTuple(keyAfter(tenantEntry.tenantGroup.get())), + 1)); + + if (result.results.size() == 0) { + ManagementClusterMetadata::clusterTenantGroupIndex.erase( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantGroup.get())); + + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.erase(tr, tenantEntry.tenantGroup.get()); + updateClusterCapacity = true; + } + } + + // Update the tenant group count information for the assigned cluster if this tenant group was erased so we + // can use the freed capacity. + if (updateClusterCapacity) { + DataClusterEntry updatedEntry = clusterMetadata->entry; + --updatedEntry.allocated.numTenantGroups; + updateClusterMetadata( + tr, tenantEntry.assignedCluster.get(), *clusterMetadata, Optional(), updatedEntry); + + clusterMetadata->entry = updatedEntry; + } + + return Void(); +} + +template +struct CreateTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName tenantName; + TenantMapEntry tenantEntry; + + // Parameter set if tenant creation permanently fails on the data cluster + Optional replaceExistingTenantId; + + CreateTenantImpl(Reference managementDb, TenantName tenantName, TenantMapEntry tenantEntry) + : ctx(managementDb), tenantName(tenantName), tenantEntry(tenantEntry) {} + + ACTOR static Future checkClusterAvailability(Reference dataClusterDb, + ClusterName clusterName) { + state Reference tr = dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->addWriteConflictRange(KeyRangeRef("\xff/metacluster/availability_check"_sr, + "\xff/metacluster/availability_check\x00"_sr)); + wait(safeThreadFutureToFuture(tr->commit())); + return clusterName; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + // Returns true if the 
tenant is already assigned and can proceed to the next step and false if it needs + // to be created. Throws an error if the tenant already exists and cannot be created. + ACTOR static Future checkForExistingTenant(CreateTenantImpl* self, Reference tr) { + // Check if the tenant already exists. If it's partially created and matches the parameters we + // specified, continue creating it. Otherwise, fail with an error. + state Optional existingEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + if (existingEntry.present()) { + if (!existingEntry.get().matchesConfiguration(self->tenantEntry) || + existingEntry.get().tenantState != TenantState::REGISTERING) { + // The tenant already exists and is either completely created or has a different + // configuration + throw tenant_already_exists(); + } else if (!self->replaceExistingTenantId.present() || + self->replaceExistingTenantId.get() != existingEntry.get().id) { + // The tenant creation has already started, so resume where we left off + self->tenantEntry = existingEntry.get(); + ASSERT(existingEntry.get().assignedCluster.present()); + + wait(self->ctx.setCluster(tr, existingEntry.get().assignedCluster.get())); + return true; + } else { + // The previous creation is permanently failed, so cleanup the tenant and create it again from scratch + // We don't need to remove it from the tenant map because we will overwrite the existing entry later in + // this transaction. 
+ ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, existingEntry.get().id); + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, existingEntry.get().assignedCluster.get(), -1, MutationRef::AddValue); + + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, + Tuple::makeTuple( + existingEntry.get().assignedCluster.get(), self->tenantName, existingEntry.get().id)); + + state DataClusterMetadata previousAssignedClusterMetadata = + wait(getClusterTransaction(tr, existingEntry.get().assignedCluster.get())); + + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, existingEntry.get(), &previousAssignedClusterMetadata)); + } + } else if (self->replaceExistingTenantId.present()) { + throw tenant_removed(); + } + + return false; + } + + // Returns a pair with the name of the assigned cluster and whether the group was already assigned + ACTOR static Future> assignTenant(CreateTenantImpl* self, + Reference tr) { + // If our tenant group is already assigned, then we just use that assignment + state Optional groupEntry; + if (self->tenantEntry.tenantGroup.present()) { + Optional _groupEntry = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get( + tr, self->tenantEntry.tenantGroup.get())); + groupEntry = _groupEntry; + + if (groupEntry.present()) { + ASSERT(groupEntry.get().assignedCluster.present()); + return std::make_pair(groupEntry.get().assignedCluster.get(), true); + } + } + + // Get a set of the most full clusters that still have capacity + state KeyBackedSet::RangeResultType availableClusters = + wait(ManagementClusterMetadata::clusterCapacityIndex.getRange( + tr, {}, {}, CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, Snapshot::False, Reverse::True)); + + if (availableClusters.results.empty()) { + throw metacluster_no_capacity(); + } + + state std::vector>> dataClusterDbs; + for (auto 
clusterTuple : availableClusters.results) { + dataClusterDbs.push_back(getAndOpenDatabase(tr, clusterTuple.getString(1))); + } + + wait(waitForAll(dataClusterDbs)); + + // Check the availability of our set of clusters + state std::vector> clusterAvailabilityChecks; + for (int i = 0; i < availableClusters.results.size(); ++i) { + clusterAvailabilityChecks.push_back( + checkClusterAvailability(dataClusterDbs[i].get(), availableClusters.results[i].getString(1))); + } + + // Wait for a successful availability check from some cluster. We prefer the most full cluster, but if it + // doesn't return quickly we may choose another. + Optional clusterAvailabilityCheck = wait(timeout( + success(clusterAvailabilityChecks[0]) || (delay(CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY) && + waitForAny(clusterAvailabilityChecks)), + CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT)); + + if (!clusterAvailabilityCheck.present()) { + // If no clusters were available for long enough, then we throw an error and try again + throw transaction_too_old(); + } + + // Get the first cluster that was available + state Optional chosenCluster; + for (auto f : clusterAvailabilityChecks) { + if (f.isReady()) { + chosenCluster = f.get(); + break; + } + } + + ASSERT(chosenCluster.present()); + return std::make_pair(chosenCluster.get(), false); + } + + ACTOR static Future assignTenantAndStoreInManagementCluster(CreateTenantImpl* self, + Reference tr) { + // If the tenant already exists, we either throw an error from this function or move on to the next phase + bool tenantExists = wait(checkForExistingTenant(self, tr)); + if (tenantExists) { + return Void(); + } + + // Choose a cluster for the tenant + state std::pair assignment = wait(assignTenant(self, tr)); + self->tenantEntry.assignedCluster = assignment.first; + + // Update the context with the chosen cluster + state Future setClusterFuture = self->ctx.setCluster(tr, assignment.first); + + // Create a tenant entry in the 
management cluster + Optional lastId = wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.get(tr)); + self->tenantEntry.setId(lastId.orDefault(-1) + 1); + ManagementClusterMetadata::tenantMetadata().lastTenantId.set(tr, self->tenantEntry.id); + + self->tenantEntry.tenantState = TenantState::REGISTERING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->tenantEntry); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantEntry.id, self->tenantName); + + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, self->tenantEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + int64_t clusterTenantCount = wait(ManagementClusterMetadata::clusterTenantCount.getD( + tr, self->tenantEntry.assignedCluster.get(), Snapshot::False, 0)); + + if (clusterTenantCount > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(self->tenantEntry.assignedCluster.get(), self->tenantName, self->tenantEntry.id)); + + wait(setClusterFuture); + + // If we are part of a tenant group that is assigned to a cluster being removed from the metacluster, + // then we fail with an error. 
+ if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } + + managementClusterAddTenantToGroup( + tr, self->tenantName, self->tenantEntry, &self->ctx.dataClusterMetadata.get(), assignment.second); + + return Void(); + } + + ACTOR static Future storeTenantInDataCluster(CreateTenantImpl* self, Reference tr) { + std::pair, bool> dataClusterTenant = wait( + TenantAPI::createTenantTransaction(tr, self->tenantName, self->tenantEntry, ClusterType::METACLUSTER_DATA)); + + // If the tenant map entry is empty, then we encountered a tombstone indicating that the tenant was + // simultaneously removed. + if (!dataClusterTenant.first.present()) { + throw tenant_removed(); + } + + return Void(); + } + + ACTOR static Future markTenantReady(CreateTenantImpl* self, Reference tr) { + state Optional managementEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + if (!managementEntry.present()) { + throw tenant_removed(); + } else if (managementEntry.get().id != self->tenantEntry.id) { + throw tenant_already_exists(); + } + + if (managementEntry.get().tenantState == TenantState::REGISTERING) { + TenantMapEntry updatedEntry = managementEntry.get(); + updatedEntry.tenantState = TenantState::READY; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + } + + return Void(); + } + + ACTOR static Future run(CreateTenantImpl* self) { + if (self->tenantName.startsWith("\xff"_sr)) { + throw invalid_tenant_name(); + } + + loop { + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return assignTenantAndStoreInManagementCluster(self, tr); + })); + + self->replaceExistingTenantId = {}; + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return storeTenantInDataCluster(self, tr); })); + + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markTenantReady(self, tr); })); + + return Void(); + } catch 
(Error& e) { + if (e.code() == error_code_tenant_creation_permanently_failed) { + // If the data cluster has permanently failed to create the tenant, then we can reassign it in + // the management cluster and start over + self->replaceExistingTenantId = self->tenantEntry.id; + self->ctx.clearCluster(); + } else { + throw; + } + } + } + } + Future run() { return run(this); } +}; + +ACTOR template +Future createTenant(Reference db, TenantName name, TenantMapEntry tenantEntry) { + state CreateTenantImpl impl(db, name, tenantEntry); + wait(impl.run()); + return Void(); +} + +template +struct DeleteTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName tenantName; + + // Parameters set in getAssignedLocation + int64_t tenantId; + + // Parameters set in markTenantInRemovingState + Optional pairName; + + DeleteTenantImpl(Reference managementDb, TenantName tenantName) : ctx(managementDb), tenantName(tenantName) {} + + // Loads the cluster details for the cluster where the tenant is assigned. + // Returns true if the deletion is already in progress + ACTOR static Future getAssignedLocation(DeleteTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present()) { + throw tenant_not_found(); + } + + // Disallow removing the "new" name of a renamed tenant before it completes + if (tenantEntry.get().tenantState == TenantState::RENAMING_TO) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState == TenantState::REMOVING) { + if (tenantEntry.get().renamePair.present()) { + self->pairName = tenantEntry.get().renamePair.get(); + } + } + + self->tenantId = tenantEntry.get().id; + wait(self->ctx.setCluster(tr, tenantEntry.get().assignedCluster.get())); + return tenantEntry.get().tenantState == TenantState::REMOVING; + } + + // Does an initial check if the tenant is empty. 
This is an optimization to prevent us marking a tenant + // in the deleted state while it has data, but it is still possible that data gets added to it after this + // point. + // + // SOMEDAY: should this also lock the tenant when locking is supported? + ACTOR static Future checkTenantEmpty(DeleteTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(TenantAPI::tryGetTenantTransaction(tr, self->tenantName)); + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + // The tenant must have been removed simultaneously + return Void(); + } + + ThreadFuture rangeFuture = tr->getRange(prefixRange(tenantEntry.get().prefix), 1); + RangeResult result = wait(safeThreadFutureToFuture(rangeFuture)); + if (!result.empty()) { + throw tenant_not_empty(); + } + + return Void(); + } + + // Mark the tenant as being in a removing state on the management cluster + ACTOR static Future markTenantInRemovingState(DeleteTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState != TenantState::REMOVING) { + // Disallow removing the "new" name of a renamed tenant before it completes + if (tenantEntry.get().tenantState == TenantState::RENAMING_TO) { + throw tenant_not_found(); + } + state TenantMapEntry updatedEntry = tenantEntry.get(); + // Check if we are deleting a tenant in the middle of a rename + if (updatedEntry.renamePair.present()) { + ASSERT(updatedEntry.tenantState == TenantState::RENAMING_FROM); + self->pairName = updatedEntry.renamePair.get(); + } + updatedEntry.tenantState = TenantState::REMOVING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + // If this has a rename pair, also mark the other entry for deletion + if (self->pairName.present()) { + state Optional pairEntry = 
wait(tryGetTenantTransaction(tr, self->pairName.get())); + TenantMapEntry updatedPairEntry = pairEntry.get(); + // Sanity check that our pair has us named as their partner + ASSERT(updatedPairEntry.renamePair.present()); + ASSERT(updatedPairEntry.renamePair.get() == self->tenantName); + ASSERT(updatedPairEntry.id == self->tenantId); + CODE_PROBE(true, "marking pair tenant in removing state"); + updatedPairEntry.tenantState = TenantState::REMOVING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->pairName.get(), updatedPairEntry); + } + } + + return Void(); + } + + // Delete the tenant and related metadata on the management cluster + ACTOR static Future deleteTenantFromManagementCluster(DeleteTenantImpl* self, + Reference tr, + bool pairDelete = false) { + // If pair is present, and this is not already a pair delete, call this function recursively + state Future pairFuture = Void(); + if (!pairDelete && self->pairName.present()) { + CODE_PROBE(true, "deleting pair tenant from management cluster"); + pairFuture = deleteTenantFromManagementCluster(self, tr, true); + } + state TenantName tenantName = pairDelete ? 
self->pairName.get() : self->tenantName; + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + return Void(); + } + + ASSERT(tenantEntry.get().tenantState == TenantState::REMOVING); + + // Erase the tenant entry itself + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantName); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, tenantEntry.get().id); + + // This is idempotent because this function is only called if the tenant is in the map + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.get().assignedCluster.get(), -1, MutationRef::AddValue); + + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.get().assignedCluster.get(), tenantName, self->tenantId)); + + // Remove the tenant from its tenant group + wait(managementClusterRemoveTenantFromGroup( + tr, tenantName, tenantEntry.get(), &self->ctx.dataClusterMetadata.get(), pairDelete)); + + wait(pairFuture); + return Void(); + } + + ACTOR static Future run(DeleteTenantImpl* self) { + // Get information about the tenant and where it is assigned + bool deletionInProgress = wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return getAssignedLocation(self, tr); })); + + if (!deletionInProgress) { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return checkTenantEmpty(self, tr); })); + + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return markTenantInRemovingState(self, tr); + })); + } + + // Delete tenant on the data cluster + wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { + // If the removed tenant is being renamed, attempt to delete both the old and new names. 
+ // At most one should be present with the given ID, and the other will be a no-op. + Future pairDelete = Void(); + if (self->pairName.present()) { + CODE_PROBE(true, "deleting pair tenant from data cluster"); + pairDelete = TenantAPI::deleteTenantTransaction( + tr, self->pairName.get(), self->tenantId, ClusterType::METACLUSTER_DATA); + } + return pairDelete && TenantAPI::deleteTenantTransaction( + tr, self->tenantName, self->tenantId, ClusterType::METACLUSTER_DATA); + })); + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return deleteTenantFromManagementCluster(self, tr); + })); + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future deleteTenant(Reference db, TenantName name) { + state DeleteTenantImpl impl(db, name); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future>> listTenantsTransaction(Transaction tr, + TenantNameRef begin, + TenantNameRef end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + KeyBackedRangeResult> results = + wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit)); + + return results.results; +} + +ACTOR template +Future>> listTenants(Reference db, + TenantName begin, + TenantName end, + int limit) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + std::vector> tenants = + wait(listTenantsTransaction(tr, begin, end, limit)); + return tenants; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +template +struct ConfigureTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName tenantName; + std::map, Optional> configurationParameters; + + // Parameters set in updateManagementCluster + TenantMapEntry updatedEntry; + + ConfigureTenantImpl(Reference managementDb, + TenantName tenantName, + std::map, Optional> 
configurationParameters) + : ctx(managementDb), tenantName(tenantName), configurationParameters(configurationParameters) {} + + // This verifies that the tenant group can be changed, and if so it updates all of the tenant group data + // structures. It does not update the TenantMapEntry stored in the tenant map. + ACTOR static Future updateTenantGroup(ConfigureTenantImpl* self, + Reference tr, + TenantMapEntry tenantEntry, + Optional desiredGroup) { + + state TenantMapEntry entryWithUpdatedGroup = tenantEntry; + entryWithUpdatedGroup.tenantGroup = desiredGroup; + + if (tenantEntry.tenantGroup == desiredGroup) { + return Void(); + } + + // Removing a tenant group is only possible if we have capacity for more groups on the current cluster + else if (!desiredGroup.present()) { + if (!self->ctx.dataClusterMetadata.get().entry.hasCapacity()) { + throw metacluster_no_capacity(); + } + + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), false); + return Void(); + } + + state Optional tenantGroupEntry = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get(tr, desiredGroup.get())); + + // If we are creating a new tenant group, we need to have capacity on the current cluster + if (!tenantGroupEntry.present()) { + if (!self->ctx.dataClusterMetadata.get().entry.hasCapacity()) { + throw metacluster_no_capacity(); + } + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), false); + return Void(); + } + + // Moves between groups in the same cluster are freely allowed + else if (tenantGroupEntry.get().assignedCluster == tenantEntry.assignedCluster) { + 
wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), true); + return Void(); + } + + // We don't currently support movement between groups on different clusters + else { + throw cluster_no_capacity(); + } + } + + // Updates the configuration in the management cluster and marks it as being in the UPDATING_CONFIGURATION state + ACTOR static Future updateManagementCluster(ConfigureTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present()) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState != TenantState::READY && + tenantEntry.get().tenantState != TenantState::UPDATING_CONFIGURATION) { + throw invalid_tenant_state(); + } + + wait(self->ctx.setCluster(tr, tenantEntry.get().assignedCluster.get())); + + self->updatedEntry = tenantEntry.get(); + self->updatedEntry.tenantState = TenantState::UPDATING_CONFIGURATION; + + state std::map, Optional>::iterator configItr; + for (configItr = self->configurationParameters.begin(); configItr != self->configurationParameters.end(); + ++configItr) { + if (configItr->first == "tenant_group"_sr) { + wait(updateTenantGroup(self, tr, self->updatedEntry, configItr->second)); + } + self->updatedEntry.configure(configItr->first, configItr->second); + } + + ++self->updatedEntry.configurationSequenceNum; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->updatedEntry); + + return Void(); + } + + // Updates the configuration in the data cluster + ACTOR static Future updateDataCluster(ConfigureTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(TenantAPI::tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->updatedEntry.id || + 
tenantEntry.get().configurationSequenceNum >= self->updatedEntry.configurationSequenceNum) { + // If the tenant isn't in the metacluster, it must have been concurrently removed + return Void(); + } + + TenantMapEntry dataClusterEntry = self->updatedEntry; + dataClusterEntry.tenantState = TenantState::READY; + dataClusterEntry.assignedCluster = {}; + + wait(TenantAPI::configureTenantTransaction(tr, self->tenantName, tenantEntry.get(), dataClusterEntry)); + return Void(); + } + + // Updates the tenant state in the management cluster to READY + ACTOR static Future markManagementTenantAsReady(ConfigureTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->updatedEntry.id || + tenantEntry.get().tenantState != TenantState::UPDATING_CONFIGURATION || + tenantEntry.get().configurationSequenceNum > self->updatedEntry.configurationSequenceNum) { + return Void(); + } + + tenantEntry.get().tenantState = TenantState::READY; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get()); + return Void(); + } + + ACTOR static Future run(ConfigureTenantImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return updateManagementCluster(self, tr); })); + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markManagementTenantAsReady(self, tr); })); + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future configureTenant(Reference db, + TenantName name, + std::map, Optional> configurationParameters) { + state ConfigureTenantImpl impl(db, name, configurationParameters); + wait(impl.run()); + return Void(); +} + +template +struct RenameTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization 
parameters + TenantName oldName; + TenantName newName; + + // Parameters set in markTenantsInRenamingState + int64_t tenantId = -1; + int64_t configurationSequenceNum = -1; + + RenameTenantImpl(Reference managementDb, TenantName oldName, TenantName newName) + : ctx(managementDb), oldName(oldName), newName(newName) {} + + // Delete the tenant and related metadata on the management cluster + ACTOR static Future deleteTenantFromManagementCluster(RenameTenantImpl* self, + Reference tr, + TenantMapEntry tenantEntry) { + // Erase the tenant entry itself + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->oldName); + + // Remove old tenant from tenant count + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.assignedCluster.get(), -1, MutationRef::AddValue); + + // Clean up cluster based tenant indices and remove the old entry from its tenant group + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), self->oldName, self->tenantId)); + + // Remove the tenant from its tenant group + wait(managementClusterRemoveTenantFromGroup( + tr, self->oldName, tenantEntry, &self->ctx.dataClusterMetadata.get(), true)); + + return Void(); + } + + ACTOR static Future markTenantsInRenamingState(RenameTenantImpl* self, + Reference tr) { + state TenantMapEntry oldTenantEntry; + state Optional newTenantEntry; + wait(store(oldTenantEntry, getTenantTransaction(tr, self->oldName)) && + store(newTenantEntry, tryGetTenantTransaction(tr, self->newName))); + + if (self->tenantId != -1 && oldTenantEntry.id != self->tenantId) { + // The tenant must have been removed simultaneously + CODE_PROBE(true, "Metacluster rename old tenant ID mismatch"); + throw tenant_removed(); + } + + // If marked for deletion, abort the rename + if 
(oldTenantEntry.tenantState == TenantState::REMOVING) { + CODE_PROBE(true, "Metacluster rename candidates marked for deletion"); + throw tenant_removed(); + } + + // If the new entry is present, we can only continue if this is a retry of the same rename + // To check this, verify both entries are in the correct state + // and have each other as pairs + if (newTenantEntry.present()) { + if (newTenantEntry.get().tenantState == TenantState::RENAMING_TO && + oldTenantEntry.tenantState == TenantState::RENAMING_FROM && newTenantEntry.get().renamePair.present() && + newTenantEntry.get().renamePair.get() == self->oldName && oldTenantEntry.renamePair.present() && + oldTenantEntry.renamePair.get() == self->newName) { + wait(self->ctx.setCluster(tr, oldTenantEntry.assignedCluster.get())); + self->tenantId = newTenantEntry.get().id; + self->configurationSequenceNum = newTenantEntry.get().configurationSequenceNum; + CODE_PROBE(true, "Metacluster rename retry in progress"); + return Void(); + } else { + CODE_PROBE(true, "Metacluster rename new name already exists"); + throw tenant_already_exists(); + }; + } else { + if (self->tenantId == -1) { + self->tenantId = oldTenantEntry.id; + } + ++oldTenantEntry.configurationSequenceNum; + self->configurationSequenceNum = oldTenantEntry.configurationSequenceNum; + wait(self->ctx.setCluster(tr, oldTenantEntry.assignedCluster.get())); + if (oldTenantEntry.tenantState != TenantState::READY) { + CODE_PROBE(true, "Metacluster unable to proceed with rename operation"); + throw invalid_tenant_state(); + } + } + + // Check cluster capacity. If we would exceed the amount due to temporary extra tenants + // then we deny the rename request altogether. 
+ int64_t clusterTenantCount = wait(ManagementClusterMetadata::clusterTenantCount.getD( + tr, oldTenantEntry.assignedCluster.get(), Snapshot::False, 0)); + + if (clusterTenantCount + 1 > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + TenantMapEntry updatedOldEntry = oldTenantEntry; + TenantMapEntry updatedNewEntry(updatedOldEntry); + ASSERT(updatedOldEntry.configurationSequenceNum == self->configurationSequenceNum); + ASSERT(updatedNewEntry.configurationSequenceNum == self->configurationSequenceNum); + updatedOldEntry.tenantState = TenantState::RENAMING_FROM; + updatedNewEntry.tenantState = TenantState::RENAMING_TO; + updatedOldEntry.renamePair = self->newName; + updatedNewEntry.renamePair = self->oldName; + + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->oldName, updatedOldEntry); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + + // Add temporary tenant to tenantCount to prevent exceeding capacity during a rename + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, updatedNewEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(updatedNewEntry.assignedCluster.get(), self->newName, self->tenantId)); + + // Add new name to tenant group. It should already exist since the old name was part of it. 
+ managementClusterAddTenantToGroup( + tr, self->newName, updatedNewEntry, &self->ctx.dataClusterMetadata.get(), true); + return Void(); + } + + ACTOR static Future updateDataCluster(RenameTenantImpl* self, Reference tr) { + ASSERT(self->tenantId != -1); + ASSERT(self->configurationSequenceNum != -1); + wait(TenantAPI::renameTenantTransaction(tr, + self->oldName, + self->newName, + self->tenantId, + ClusterType::METACLUSTER_DATA, + self->configurationSequenceNum)); + return Void(); + } + + ACTOR static Future finishRenameFromManagementCluster(RenameTenantImpl* self, + Reference tr) { + state Optional oldTenantEntry; + state Optional newTenantEntry; + wait(store(oldTenantEntry, tryGetTenantTransaction(tr, self->oldName)) && + store(newTenantEntry, tryGetTenantTransaction(tr, self->newName))); + + // Another (or several other) operations have already removed/changed the old entry + // Possible for the new entry to also have been tampered with, + // so it may or may not be present with or without the same id, which are all + // legal states. 
Assume the rename completed properly in this case + if (!oldTenantEntry.present() || oldTenantEntry.get().id != self->tenantId || + oldTenantEntry.get().configurationSequenceNum > self->configurationSequenceNum) { + CODE_PROBE(true, + "Metacluster finished rename with missing entries, mismatched id, and/or mismatched " + "configuration sequence."); + return Void(); + } + if (oldTenantEntry.get().tenantState == TenantState::REMOVING) { + ASSERT(newTenantEntry.get().tenantState == TenantState::REMOVING); + throw tenant_removed(); + } + ASSERT(newTenantEntry.present()); + ASSERT(newTenantEntry.get().id == self->tenantId); + + TenantMapEntry updatedOldEntry = oldTenantEntry.get(); + TenantMapEntry updatedNewEntry = newTenantEntry.get(); + + // Only update if in the expected state + if (updatedNewEntry.tenantState == TenantState::RENAMING_TO) { + updatedNewEntry.tenantState = TenantState::READY; + updatedNewEntry.renamePair.reset(); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantId, self->newName); + } + + // We will remove the old entry from the management cluster + // This should still be the same old entry since the tenantId matches from the check above. + wait(deleteTenantFromManagementCluster(self, tr, updatedOldEntry)); + return Void(); + } + + ACTOR static Future run(RenameTenantImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markTenantsInRenamingState(self, tr); })); + + // Rename tenant on the data cluster + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + } catch (Error& e) { + // Since we track the tenant entries on the management cluster, these error codes should only appear + // on a retry of the transaction, typically caused by commit_unknown_result. 
+ // Operating on the assumption that the first transaction completed successfully, we keep going + // so we can finish the rename on the management cluster. + if (e.code() == error_code_tenant_not_found || e.code() == error_code_tenant_already_exists) { + CODE_PROBE(true, "Metacluster rename ran into commit_unknown_result"); + } else { + throw e; + } + } + + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return finishRenameFromManagementCluster(self, tr); + })); + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future renameTenant(Reference db, TenantName oldName, TenantName newName) { + state RenameTenantImpl impl(db, oldName, newName); + wait(impl.run()); + return Void(); +} + +} // namespace MetaclusterAPI + +#include "flow/unactorcompiler.h" +#endif \ No newline at end of file diff --git a/fdbclient/include/fdbclient/MultiVersionTransaction.h b/fdbclient/include/fdbclient/MultiVersionTransaction.h index 9593a3bc67..b17601cb19 100644 --- a/fdbclient/include/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/include/fdbclient/MultiVersionTransaction.h @@ -122,6 +122,8 @@ struct FdbCApi : public ThreadSafeReferenceCounted { // Network fdb_error_t (*selectApiVersion)(int runtimeVersion, int headerVersion); const char* (*getClientVersion)(); + void (*useFutureProtocolVersion)(); + fdb_error_t (*setNetworkOption)(FDBNetworkOption option, uint8_t const* value, int valueLength); fdb_error_t (*setupNetwork)(); fdb_error_t (*runNetwork)(); @@ -169,6 +171,32 @@ struct FdbCApi : public ThreadSafeReferenceCounted { uint8_t const* purge_key_name, int purge_key_name_length); + FDBFuture* (*databaseBlobbifyRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + + FDBFuture* (*databaseUnblobbifyRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int 
end_key_name_length); + + FDBFuture* (*databaseListBlobbifiedRanges)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit); + + FDBFuture* (*databaseVerifyBlobRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + Optional version); + // Tenant fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction); @@ -270,20 +298,39 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int end_key_name_length, int64_t chunkSize); - FDBFuture* (*transactionGetBlobGranuleRanges)(FDBTransaction* db, + FDBFuture* (*transactionGetBlobGranuleRanges)(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length); + int end_key_name_length, + int rangeLimit); - FDBResult* (*transactionReadBlobGranules)(FDBTransaction* db, + FDBResult* (*transactionReadBlobGranules)(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length, int64_t beginVersion, - int64_t readVersion, - FDBReadBlobGranuleContext granule_context); + int64_t readVersion); + + FDBFuture* (*transactionReadBlobGranulesStart)(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut); + + FDBResult* (*transactionReadBlobGranulesFinish)(FDBTransaction* tr, + FDBFuture* startFuture, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granule_context); FDBFuture* (*transactionCommit)(FDBTransaction* tr); fdb_error_t (*transactionGetCommittedVersion)(FDBTransaction* tr, 
int64_t* outVersion); @@ -374,13 +421,26 @@ public: ThreadFuture getEstimatedRangeSizeBytes(const KeyRangeRef& keys) override; ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + void addReadConflictRange(const KeyRangeRef& keys) override; void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) override; @@ -474,6 +534,12 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -492,6 +558,7 @@ public: void selectApiVersion(int apiVersion) override; const char* getClientVersion() override; + void useFutureProtocolVersion() override; void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) override; void setupNetwork() override; @@ -571,13 +638,26 @@ public: 
ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) override; void set(const KeyRef& key, const ValueRef& value) override; void clear(const KeyRef& begin, const KeyRef& end) override; @@ -643,6 +723,9 @@ private: template ThreadResult abortableTimeoutResult(ThreadFuture abortSignal); + template + ThreadResult abortableResult(ThreadResult result, ThreadFuture abortSignal); + TransactionInfo transaction; TransactionInfo getTransaction(); @@ -655,8 +738,10 @@ private: struct ClientDesc { std::string const libPath; bool const external; + bool const useFutureVersion; - ClientDesc(std::string libPath, bool external) : libPath(libPath), external(external) {} + ClientDesc(std::string libPath, bool external, bool useFutureVersion) + : libPath(libPath), external(external), useFutureVersion(useFutureVersion) {} }; struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { @@ -668,11 +753,11 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { std::vector> threadCompletionHooks; ClientInfo() - : ClientDesc(std::string(), false), protocolVersion(0), api(nullptr), failed(true), initialized(false) {} + : ClientDesc(std::string(), false, false), protocolVersion(0), api(nullptr), 
failed(true), initialized(false) {} ClientInfo(IClientApi* api) - : ClientDesc("internal", false), protocolVersion(0), api(api), failed(false), initialized(false) {} - ClientInfo(IClientApi* api, std::string libPath) - : ClientDesc(libPath, true), protocolVersion(0), api(api), failed(false), initialized(false) {} + : ClientDesc("internal", false, false), protocolVersion(0), api(api), failed(false), initialized(false) {} + ClientInfo(IClientApi* api, std::string libPath, bool useFutureVersion) + : ClientDesc(libPath, true, useFutureVersion), protocolVersion(0), api(api), failed(false), initialized(false) {} void loadVersion(); bool canReplace(Reference other) const; @@ -812,6 +897,12 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -919,6 +1010,7 @@ class MultiVersionApi : public IClientApi { public: void selectApiVersion(int apiVersion) override; const char* getClientVersion() override; + void useFutureProtocolVersion() override; void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) override; void setupNetwork() override; @@ -965,7 +1057,7 @@ private: void disableMultiVersionClientApi(); void setCallbacksOnExternalThreads(); - void addExternalLibrary(std::string path); + void addExternalLibrary(std::string path, bool useFutureVersion); void addExternalLibraryDirectory(std::string path); // Return a vector of (pathname, unlink_on_close) pairs. 
Makes threadCount - 1 copies of the library stored in // path, and returns a vector of length threadCount. diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 9411db3eee..02ccf6d500 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -239,14 +239,12 @@ FDB_DECLARE_BOOLEAN_PARAM(AllowInvalidTenantID); struct TransactionState : ReferenceCounted { Database cx; - int64_t tenantId = TenantInfo::INVALID_TENANT; Optional> authToken; Reference trLogInfo; TransactionOptions options; + Optional readOptions; - Optional debugID; TaskPriority taskID; - ReadType readType = ReadType::NORMAL; SpanContext spanContext; UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False; bool readVersionObtainedFromGrvProxy; @@ -286,8 +284,18 @@ struct TransactionState : ReferenceCounted { Optional const& tenant(); bool hasTenant() const; + int64_t tenantId() const { return tenantId_; } + void trySetTenantId(int64_t tenantId) { + if (tenantId_ == TenantInfo::INVALID_TENANT) { + tenantId_ = tenantId; + } + } + + Future handleUnknownTenant(); + private: Optional tenant_; + int64_t tenantId_ = TenantInfo::INVALID_TENANT; bool tenantSet; }; @@ -407,12 +415,16 @@ public: // The returned list would still be in form of [keys.begin, splitPoint1, splitPoint2, ... 
, keys.end] Future>> getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize); - Future>> getBlobGranuleRanges(const KeyRange& range); + Future>> getBlobGranuleRanges(const KeyRange& range, int rangeLimit); Future>> readBlobGranules(const KeyRange& range, Version begin, Optional readVersion, Version* readVersionOut = nullptr); + Future>> summarizeBlobGranules(const KeyRange& range, + Version summaryVersion, + int rangeLimit); + // If checkWriteConflictRanges is true, existing write conflict ranges will be searched for this key void set(const KeyRef& key, const ValueRef& value, AddConflictRange = AddConflictRange::True); void atomicOp(const KeyRef& key, @@ -447,7 +459,13 @@ public: void fullReset(); double getBackoff(int errCode); - void debugTransaction(UID dID) { trState->debugID = dID; } + void debugTransaction(UID dID) { + if (trState->readOptions.present()) { + trState->readOptions.get().debugID = dID; + } else { + trState->readOptions = ReadOptions(dID); + } + } VersionVector getVersionVector() const; SpanContext getSpanContext() const { return trState->spanContext; } diff --git a/fdbclient/include/fdbclient/ReadYourWrites.h b/fdbclient/include/fdbclient/ReadYourWrites.h index 89de979bc1..46650be3d3 100644 --- a/fdbclient/include/fdbclient/ReadYourWrites.h +++ b/fdbclient/include/fdbclient/ReadYourWrites.h @@ -20,6 +20,7 @@ #ifndef FDBCLIENT_READYOURWRITES_H #define FDBCLIENT_READYOURWRITES_H +#include "Status.h" #pragma once #include "fdbclient/NativeAPI.actor.h" @@ -120,7 +121,7 @@ public: Future>> getRangeSplitPoints(const KeyRange& range, int64_t chunkSize) override; Future getEstimatedRangeSizeBytes(const KeyRange& keys) override; - Future>> getBlobGranuleRanges(const KeyRange& range) override; + Future>> getBlobGranuleRanges(const KeyRange& range, int rangeLimit) override; Future>> readBlobGranules(const KeyRange& range, Version begin, Optional readVersion, @@ -192,7 +193,17 @@ public: KeyRangeMap>>& getSpecialKeySpaceWriteMap() { return 
specialKeySpaceWriteMap; } bool readYourWritesDisabled() const { return options.readYourWritesDisabled; } const Optional& getSpecialKeySpaceErrorMsg() { return specialKeySpaceErrorMsg; } - void setSpecialKeySpaceErrorMsg(const std::string& msg) { specialKeySpaceErrorMsg = msg; } + void setSpecialKeySpaceErrorMsg(const std::string& msg) { + if (g_network && g_network->isSimulated()) { + try { + readJSONStrictly(msg); + } catch (Error& e) { + TraceEvent(SevError, "InvalidSpecialKeySpaceErrorMessage").error(e).detail("Message", msg); + ASSERT(false); + } + } + specialKeySpaceErrorMsg = msg; + } Transaction& getTransaction() { return tr; } Optional getTenant() { return tr.getTenant(); } diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 48702be13c..ab55f3dc43 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -50,7 +50,6 @@ public: bool PEEK_USING_STREAMING; double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time double TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS; // Warns if a tlog takes too long to rejoin - double RECOVERY_TLOG_SMART_QUORUM_DELAY; // smaller might be better for bug amplification double TLOG_STORAGE_MIN_UPDATE_INTERVAL; double BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL; int DESIRED_TOTAL_BYTES; @@ -58,10 +57,6 @@ public: double UPDATE_DELAY; int MAXIMUM_PEEK_BYTES; int APPLY_MUTATION_BYTES; - int RECOVERY_DATA_BYTE_LIMIT; - int BUGGIFY_RECOVERY_DATA_LIMIT; - double LONG_TLOG_COMMIT_TIME; - int64_t LARGE_TLOG_COMMIT_BYTES; double BUGGIFY_RECOVER_MEMORY_LIMIT; double BUGGIFY_WORKER_REMOVED_MAX_LAG; int64_t UPDATE_STORAGE_BYTE_LIMIT; @@ -123,16 +118,16 @@ public: double BG_REBALANCE_POLLING_INTERVAL; double BG_REBALANCE_SWITCH_CHECK_INTERVAL; double DD_QUEUE_LOGGING_INTERVAL; + double DD_QUEUE_COUNTER_REFRESH_INTERVAL; + double DD_QUEUE_COUNTER_MAX_LOG; // max number of servers for which trace events will be generated in each round of + 
// DD_QUEUE_COUNTER_REFRESH_INTERVAL duration + bool DD_QUEUE_COUNTER_SUMMARIZE; // Enable summary of remaining servers when the number of servers with ongoing + // relocations in the last minute exceeds DD_QUEUE_COUNTER_MAX_LOG double RELOCATION_PARALLELISM_PER_SOURCE_SERVER; double RELOCATION_PARALLELISM_PER_DEST_SERVER; int DD_QUEUE_MAX_KEY_SERVERS; int DD_REBALANCE_PARALLELISM; int DD_REBALANCE_RESET_AMOUNT; - double BG_DD_MAX_WAIT; - double BG_DD_MIN_WAIT; - double BG_DD_INCREASE_RATE; - double BG_DD_DECREASE_RATE; - double BG_DD_SATURATION_DELAY; double INFLIGHT_PENALTY_HEALTHY; double INFLIGHT_PENALTY_REDUNDANT; double INFLIGHT_PENALTY_UNHEALTHY; @@ -161,9 +156,14 @@ public: int PRIORITY_TEAM_FAILED; // Priority when a server in the team is excluded as failed int PRIORITY_TEAM_0_LEFT; int PRIORITY_SPLIT_SHARD; + int PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD; // Priority when a physical shard is oversize or anonymous // Data distribution bool SHARD_ENCODE_LOCATION_METADATA; // If true, location metadata will contain shard ID. + bool ENABLE_DD_PHYSICAL_SHARD; // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true. 
+ int64_t MAX_PHYSICAL_SHARD_BYTES; + double PHYSICAL_SHARD_METRICS_DELAY; + double ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME; double READ_REBALANCE_CPU_THRESHOLD; // read rebalance only happens if the source servers' CPU > threshold int READ_REBALANCE_SRC_PARALLELISM; // the max count a server become a source server within a certain interval @@ -195,7 +195,6 @@ public: double SERVER_LIST_DELAY; double RECRUITMENT_IDLE_DELAY; double STORAGE_RECRUITMENT_DELAY; - double BLOB_WORKER_RECRUITMENT_DELAY; bool TSS_HACK_IDENTITY_MAPPING; double TSS_RECRUITMENT_TIMEOUT; double TSS_DD_CHECK_INTERVAL; @@ -234,6 +233,8 @@ public: int DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY; int DD_STORAGE_WIGGLE_PAUSE_THRESHOLD; // How many unhealthy relocations are ongoing will pause storage wiggle int DD_STORAGE_WIGGLE_STUCK_THRESHOLD; // How many times bestTeamStuck accumulate will pause storage wiggle + int64_t + DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled bool DD_TENANT_AWARENESS_ENABLED; int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed @@ -255,9 +256,8 @@ public: // Run storage enginee on a child process on the same machine with storage process bool REMOTE_KV_STORE; - // A delay to avoid race on file resources if the new kv store process started immediately after the previous kv - // store process died - double REMOTE_KV_STORE_INIT_DELAY; + // A delay to avoid race on file resources after seeing lock_file_failure + double REBOOT_KV_STORE_DELAY; // max waiting time for the remote kv store to initialize double REMOTE_KV_STORE_MAX_INIT_DURATION; @@ -302,6 +302,7 @@ public: int64_t REPLACE_CONTENTS_BYTES; // KeyValueStoreRocksDB + int ROCKSDB_READ_RANGE_ROW_LIMIT; int ROCKSDB_BACKGROUND_PARALLELISM; int ROCKSDB_READ_PARALLELISM; int64_t ROCKSDB_MEMTABLE_BYTES; @@ -328,6 +329,7 @@ public: std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY; bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable rocks perf context 
metrics. May cause performance overhead double ROCKSDB_PERFCONTEXT_SAMPLE_RATE; + double ROCKSDB_METRICS_SAMPLE_INTERVAL; int ROCKSDB_MAX_SUBCOMPACTIONS; int64_t ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT; int64_t ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT; @@ -337,6 +339,12 @@ public: int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE; int64_t ROCKSDB_BLOCK_SIZE; bool ENABLE_SHARDED_ROCKSDB; + int64_t ROCKSDB_WRITE_BUFFER_SIZE; + int64_t ROCKSDB_CF_WRITE_BUFFER_SIZE; + int64_t ROCKSDB_MAX_TOTAL_WAL_SIZE; + int64_t ROCKSDB_MAX_BACKGROUND_JOBS; + int64_t ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD; + double ROCKSDB_PHYSICAL_SHARD_CLEAN_UP_DELAY; // Leader election int MAX_NOTIFICATIONS; @@ -548,6 +556,10 @@ public: double RATEKEEPER_DEFAULT_LIMIT; double RATEKEEPER_LIMIT_REASON_SAMPLE_RATE; bool RATEKEEPER_PRINT_LIMIT_REASON; + double RATEKEEPER_MIN_RATE; + double RATEKEEPER_MAX_RATE; + double RATEKEEPER_BATCH_MIN_RATE; + double RATEKEEPER_BATCH_MAX_RATE; int64_t TARGET_BYTES_PER_STORAGE_SERVER; int64_t SPRING_BYTES_STORAGE_SERVER; @@ -591,6 +603,8 @@ public: // Use global tag throttling strategy. i.e. throttle based on the cluster-wide // throughput for tags and their associated quotas. 
bool GLOBAL_TAG_THROTTLING; + // Enforce tag throttling on proxies rather than on clients + bool ENFORCE_TAG_THROTTLING_ON_PROXIES; // Minimum number of transactions per second that the global tag throttler must allow for each tag double GLOBAL_TAG_THROTTLING_MIN_RATE; // Used by global tag throttling counters @@ -618,8 +632,18 @@ public: double INITIAL_DURABILITY_LAG_MULTIPLIER; double DURABILITY_LAG_REDUCTION_RATE; double DURABILITY_LAG_INCREASE_RATE; - double STORAGE_SERVER_LIST_FETCH_TIMEOUT; + bool BW_THROTTLING_ENABLED; + double TARGET_BW_LAG; + double TARGET_BW_LAG_BATCH; + double TARGET_BW_LAG_UPDATE; + int MIN_BW_HISTORY; + double BW_ESTIMATION_INTERVAL; + double BW_LAG_INCREASE_AMOUNT; + double BW_LAG_DECREASE_AMOUNT; + double BW_FETCH_WORKERS_INTERVAL; + double BW_RW_LOGGING_INTERVAL; + double BW_MAX_BLOCKED_INTERVAL; // disk snapshot int64_t MAX_FORKED_PROCESS_OUTPUT; @@ -658,12 +682,12 @@ public: int STORAGE_LIMIT_BYTES; int BUGGIFY_LIMIT_BYTES; bool FETCH_USING_STREAMING; + bool FETCH_USING_BLOB; int FETCH_BLOCK_BYTES; int FETCH_KEYS_PARALLELISM_BYTES; int FETCH_KEYS_PARALLELISM; int FETCH_KEYS_PARALLELISM_FULL; int FETCH_KEYS_LOWER_PRIORITY; - int FETCH_CHANGEFEED_PARALLELISM; int SERVE_FETCH_CHECKPOINT_PARALLELISM; int BUGGIFY_BLOCK_BYTES; int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT; @@ -672,7 +696,6 @@ public: int STORAGE_COMMIT_BYTES; int STORAGE_FETCH_BYTES; double STORAGE_COMMIT_INTERVAL; - double UPDATE_SHARD_VERSION_INTERVAL; int BYTE_SAMPLING_FACTOR; int BYTE_SAMPLING_OVERHEAD; int MAX_STORAGE_SERVER_WATCH_BYTES; @@ -681,7 +704,6 @@ public: int BYTE_SAMPLE_LOAD_PARALLELISM; double BYTE_SAMPLE_LOAD_DELAY; double BYTE_SAMPLE_START_DELAY; - double UPDATE_STORAGE_PROCESS_STATS_INTERVAL; double BEHIND_CHECK_DELAY; int BEHIND_CHECK_COUNT; int64_t BEHIND_CHECK_VERSIONS; @@ -755,7 +777,6 @@ public: // Dynamic Knobs (implementation) double COMPACTION_INTERVAL; - double UPDATE_NODE_TIMEOUT; double GET_COMMITTED_VERSION_TIMEOUT; double 
GET_SNAPSHOT_AND_CHANGES_TIMEOUT; double FETCH_CHANGES_TIMEOUT; @@ -771,14 +792,6 @@ public: bool DISABLE_DUPLICATE_LOG_WARNING; double HISTOGRAM_REPORT_INTERVAL; - // IPager - int PAGER_RESERVED_PAGES; - - // IndirectShadowPager - int FREE_PAGE_VACUUM_THRESHOLD; - int VACUUM_QUEUE_SIZE; - int VACUUM_BYTES_PER_SECOND; - // Timekeeper int64_t TIME_KEEPER_DELAY; int64_t TIME_KEEPER_MAX_ENTRIES; @@ -801,11 +814,9 @@ public: int64_t FASTRESTORE_ROLE_LOGGING_DELAY; int64_t FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL; // How quickly to update process metrics for restore int64_t FASTRESTORE_ATOMICOP_WEIGHT; // workload amplication factor for atomic op - int64_t FASTRESTORE_APPLYING_PARALLELISM; // number of outstanding txns writing to dest. DB int64_t FASTRESTORE_MONITOR_LEADER_DELAY; int64_t FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS; bool FASTRESTORE_TRACK_REQUEST_LATENCY; // true to track reply latency of each request in a request batch - bool FASTRESTORE_TRACK_LOADER_SEND_REQUESTS; // track requests of load send mutations to appliers? 
int64_t FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT; // threshold when pipelined actors should be delayed int64_t FASTRESTORE_WAIT_FOR_MEMORY_LATENCY; int64_t FASTRESTORE_HEARTBEAT_DELAY; // interval for master to ping loaders and appliers @@ -877,6 +888,7 @@ public: int SIM_KMS_MAX_KEYS; int ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH; bool ENABLE_TLOG_ENCRYPTION; + bool ENABLE_STORAGE_SERVER_ENCRYPTION; // Currently only Redwood engine supports encryption bool ENABLE_BLOB_GRANULE_ENCRYPTION; // Compression @@ -890,8 +902,6 @@ public: // FIXME: configure url with database configuration instead of knob eventually std::string BG_URL; - // whether to use blobRangeKeys or tenants for blob granule range sources - std::string BG_RANGE_SOURCE; // Whether to use knobs or EKP for blob metadata and credentials std::string BG_METADATA_SOURCE; @@ -911,10 +921,15 @@ public: int BG_KEY_TUPLE_TRUNCATE_OFFSET; int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM; + int BLOB_WORKER_RESNAPSHOT_PARALLELISM; + int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM; + double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout double BLOB_WORKERLIST_FETCH_INTERVAL; double BLOB_WORKER_BATCH_GRV_INTERVAL; + bool BLOB_WORKER_DO_REJECT_WHEN_FULL; + double BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD; double BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN; double BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX; diff --git a/fdbclient/include/fdbclient/SpecialKeySpace.actor.h b/fdbclient/include/fdbclient/SpecialKeySpace.actor.h index e665b83124..75cae1fc47 100644 --- a/fdbclient/include/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/include/fdbclient/SpecialKeySpace.actor.h @@ -548,6 +548,15 @@ public: Future> commit(ReadYourWritesTransaction* ryw) override; }; +class WorkerInterfacesSpecialKeyImpl : public SpecialKeyRangeReadImpl { +public: + explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr); + + Future getRange(ReadYourWritesTransaction* 
ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const override; +}; + // If the underlying set of key-value pairs of a key space is not changing, then we expect repeating a read to give the // same result. Additionally, we can generate the expected result of any read if that read is reading a subrange. This // actor performs a read of an arbitrary subrange of [begin, end) and validates the results. diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index 76c51aaea1..00e9ff2aef 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -294,15 +294,13 @@ struct GetValueRequest : TimedRequest { TenantInfo tenantInfo; Key key; Version version; - ReadType readType; Optional tags; - Optional debugID; ReplyPromise reply; + Optional options; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - - GetValueRequest() : readType(ReadType::NORMAL) {} + GetValueRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -310,16 +308,15 @@ struct GetValueRequest : TimedRequest { const TenantInfo& tenantInfo, const Key& key, Version ver, - ReadType type, Optional tags, - Optional debugID, + Optional options, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), readType(type), tags(tags), - debugID(debugID), ssLatestCommitVersions(latestCommitVersions) {} + : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), options(options), + ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer(ar, key, version, readType, tags, debugID, reply, spanContext, tenantInfo, ssLatestCommitVersions); + serializer(ar, key, version, tags, reply, spanContext, tenantInfo, options, ssLatestCommitVersions); } }; @@ 
-395,15 +392,14 @@ struct GetKeyValuesRequest : TimedRequest { KeyRef mapper = KeyRef(); Version version; // or latestVersion int limit, limitBytes; - ReadType readType; Optional tags; - Optional debugID; + Optional options; ReplyPromise reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - GetKeyValuesRequest() : readType(ReadType::NORMAL) {} + GetKeyValuesRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -415,12 +411,11 @@ struct GetKeyValuesRequest : TimedRequest { version, limit, limitBytes, - readType, tags, - debugID, reply, spanContext, tenantInfo, + options, arena, ssLatestCommitVersions); } @@ -454,15 +449,14 @@ struct GetMappedKeyValuesRequest : TimedRequest { Version version; // or latestVersion int limit, limitBytes; int matchIndex; - ReadType readType; Optional tags; - Optional debugID; + Optional options; ReplyPromise reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key range - GetMappedKeyValuesRequest() : readType(ReadType::NORMAL) {} + GetMappedKeyValuesRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -475,12 +469,11 @@ struct GetMappedKeyValuesRequest : TimedRequest { version, limit, limitBytes, - readType, tags, - debugID, reply, spanContext, tenantInfo, + options, arena, ssLatestCommitVersions, matchIndex); @@ -522,15 +515,14 @@ struct GetKeyValuesStreamRequest { KeySelectorRef begin, end; Version version; // or latestVersion int limit, limitBytes; - ReadType readType; Optional tags; - Optional debugID; + Optional options; ReplyPromiseStream reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key range - GetKeyValuesStreamRequest() : readType(ReadType::NORMAL) {} + 
GetKeyValuesStreamRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -542,12 +534,11 @@ struct GetKeyValuesStreamRequest { version, limit, limitBytes, - readType, tags, - debugID, reply, spanContext, tenantInfo, + options, arena, ssLatestCommitVersions); } @@ -574,15 +565,14 @@ struct GetKeyRequest : TimedRequest { TenantInfo tenantInfo; KeySelectorRef sel; Version version; // or latestVersion - ReadType readType; Optional tags; - Optional debugID; ReplyPromise reply; + Optional options; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - GetKeyRequest() : readType(ReadType::NORMAL) {} + GetKeyRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -590,17 +580,15 @@ struct GetKeyRequest : TimedRequest { TenantInfo tenantInfo, KeySelectorRef const& sel, Version version, - ReadType type, Optional tags, - Optional debugID, + Optional options, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), readType(type), debugID(debugID), + : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), tags(tags), options(options), ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer( - ar, sel, version, readType, tags, debugID, reply, spanContext, tenantInfo, arena, ssLatestCommitVersions); + serializer(ar, sel, version, tags, reply, spanContext, tenantInfo, options, arena, ssLatestCommitVersions); } }; diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 2571b5aa53..b41809691e 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -594,6 +594,8 @@ const Value blobManagerEpochValueFor(int64_t epoch); int64_t decodeBlobManagerEpochValue(ValueRef const& value); // blob granule keys +extern const StringRef 
blobRangeActive; +extern const StringRef blobRangeInactive; extern const uint8_t BG_FILE_TYPE_DELTA; extern const uint8_t BG_FILE_TYPE_SNAPSHOT; @@ -621,7 +623,8 @@ extern const KeyRangeRef blobGranuleHistoryKeys; // \xff\x02/bgp/(start,end) = (version, force) extern const KeyRangeRef blobGranulePurgeKeys; -extern const KeyRangeRef blobGranuleVersionKeys; +// \xff\x02/bgpforce/(start) = {1|0} (key range map) +extern const KeyRangeRef blobGranuleForcePurgedKeys; extern const KeyRef blobGranulePurgeChangeKey; const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t fileType); diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index 7cce7dcb05..0af19b85f1 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ b/fdbclient/include/fdbclient/Tenant.h @@ -25,6 +25,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/KeyBackedTypes.h" #include "fdbclient/VersionedMap.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbrpc/TenantInfo.h" #include "flow/flat_buffers.h" @@ -33,7 +34,35 @@ typedef Standalone TenantName; typedef StringRef TenantGroupNameRef; typedef Standalone TenantGroupName; -enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, ERROR }; +// Represents the various states that a tenant could be in. +// In a standalone cluster, a tenant should only ever be in the READY state. +// In a metacluster, a tenant on the management cluster could be in the other states while changes are applied to the +// data cluster. 
+// +// REGISTERING - the tenant has been created on the management cluster and is being created on the data cluster +// READY - the tenant has been created on both clusters, is active, and is consistent between the two clusters +// REMOVING - the tenant has been marked for removal and is being removed on the data cluster +// UPDATING_CONFIGURATION - the tenant configuration has changed on the management cluster and is being applied to the +// data cluster +// RENAMING_FROM - the tenant is being renamed to a new name and is awaiting the rename to complete on the data cluster +// RENAMING_TO - the tenant is being created as a rename from an existing tenant and is awaiting the rename to complete +// on the data cluster +// ERROR - the tenant is in an error state +// +// A tenant in any configuration is allowed to be removed. Only tenants in the READY or UPDATING_CONFIGURATION phases +// can have their configuration updated. A tenant must not exist or be in the REGISTERING phase to be created. To be +// renamed, a tenant must be in the READY or RENAMING_FROM state. In the latter case, the rename destination must match +// the original rename attempt. +// +// If an operation fails and the tenant is left in a non-ready state, re-running the same operation is legal. If +// successful, the tenant will return to the READY state. +enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, RENAMING_FROM, RENAMING_TO, ERROR }; + +// Represents the lock state the tenant could be in. +// Can be used in conjunction with the other tenant states above. 
+enum class TenantLockState { UNLOCKED, READ_ONLY, LOCKED }; + +constexpr int TENANT_PREFIX_SIZE = sizeof(int64_t); struct TenantMapEntry { constexpr static FileIdentifier file_identifier = 12247338; @@ -44,15 +73,24 @@ struct TenantMapEntry { static std::string tenantStateToString(TenantState tenantState); static TenantState stringToTenantState(std::string stateStr); + static std::string tenantLockStateToString(TenantLockState tenantState); + static TenantLockState stringToTenantLockState(std::string stateStr); + int64_t id = -1; Key prefix; TenantState tenantState = TenantState::READY; + TenantLockState tenantLockState = TenantLockState::UNLOCKED; Optional tenantGroup; bool encrypted = false; + Optional assignedCluster; + int64_t configurationSequenceNum = 0; + Optional renamePair; + + // Can be set to an error string if the tenant is in the ERROR state + std::string error; constexpr static int PREFIX_SIZE = sizeof(id); -public: TenantMapEntry(); TenantMapEntry(int64_t id, TenantState tenantState, bool encrypted); TenantMapEntry(int64_t id, TenantState tenantState, Optional tenantGroup, bool encrypted); @@ -70,7 +108,16 @@ public: template void serialize(Ar& ar) { - serializer(ar, id, tenantState, tenantGroup, encrypted); + serializer(ar, + id, + tenantState, + tenantLockState, + tenantGroup, + encrypted, + assignedCluster, + configurationSequenceNum, + renamePair, + error); if constexpr (Ar::isDeserializing) { if (id >= 0) { prefix = idToPrefix(id); @@ -83,7 +130,10 @@ public: struct TenantGroupEntry { constexpr static FileIdentifier file_identifier = 10764222; + Optional assignedCluster; + TenantGroupEntry() = default; + TenantGroupEntry(Optional assignedCluster) : assignedCluster(assignedCluster) {} Value encode() { return ObjectWriter::toValue(*this, IncludeVersion()); } static TenantGroupEntry decode(ValueRef const& value) { @@ -92,41 +142,67 @@ struct TenantGroupEntry { template void serialize(Ar& ar) { - serializer(ar); + serializer(ar, assignedCluster); + 
} +}; + +struct TenantTombstoneCleanupData { + constexpr static FileIdentifier file_identifier = 3291339; + + // All tombstones have been erased up to and including this id. + // We should not generate new tombstones at IDs equal to or older than this. + int64_t tombstonesErasedThrough = -1; + + // The version at which we will next erase tombstones. + Version nextTombstoneEraseVersion = invalidVersion; + + // When we reach the nextTombstoneEraseVersion, we will erase tombstones up through this ID. + int64_t nextTombstoneEraseId = -1; + + template + void serialize(Ar& ar) { + serializer(ar, tombstonesErasedThrough, nextTombstoneEraseVersion, nextTombstoneEraseId); } }; struct TenantMetadataSpecification { - static KeyRef subspace; + Key subspace; KeyBackedObjectMap tenantMap; + KeyBackedMap tenantIdIndex; KeyBackedProperty lastTenantId; KeyBackedBinaryValue tenantCount; + KeyBackedSet tenantTombstones; + KeyBackedObjectProperty tombstoneCleanupData; KeyBackedSet tenantGroupTenantIndex; KeyBackedObjectMap tenantGroupMap; - TenantMetadataSpecification(KeyRef subspace) - : tenantMap(subspace.withSuffix("tenant/map/"_sr), IncludeVersion()), - lastTenantId(subspace.withSuffix("tenant/lastId"_sr)), tenantCount(subspace.withSuffix("tenant/count"_sr)), - tenantGroupTenantIndex(subspace.withSuffix("tenant/tenantGroup/tenantIndex/"_sr)), - tenantGroupMap(subspace.withSuffix("tenant/tenantGroup/map/"_sr), IncludeVersion()) {} + TenantMetadataSpecification(KeyRef prefix) + : subspace(prefix.withSuffix("tenant/"_sr)), tenantMap(subspace.withSuffix("map/"_sr), IncludeVersion()), + tenantIdIndex(subspace.withSuffix("idIndex/"_sr)), lastTenantId(subspace.withSuffix("lastId"_sr)), + tenantCount(subspace.withSuffix("count"_sr)), tenantTombstones(subspace.withSuffix("tombstones/"_sr)), + tombstoneCleanupData(subspace.withSuffix("tombstoneCleanup"_sr), IncludeVersion()), + tenantGroupTenantIndex(subspace.withSuffix("tenantGroup/tenantIndex/"_sr)), + 
tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()) {} }; struct TenantMetadata { -private: - static inline TenantMetadataSpecification instance = TenantMetadataSpecification("\xff/"_sr); + static TenantMetadataSpecification& instance(); -public: - static inline auto& tenantMap = instance.tenantMap; - static inline auto& lastTenantId = instance.lastTenantId; - static inline auto& tenantCount = instance.tenantCount; - static inline auto& tenantGroupTenantIndex = instance.tenantGroupTenantIndex; - static inline auto& tenantGroupMap = instance.tenantGroupMap; + static inline auto& subspace() { return instance().subspace; } + static inline auto& tenantMap() { return instance().tenantMap; } + static inline auto& tenantIdIndex() { return instance().tenantIdIndex; } + static inline auto& lastTenantId() { return instance().lastTenantId; } + static inline auto& tenantCount() { return instance().tenantCount; } + static inline auto& tenantTombstones() { return instance().tenantTombstones; } + static inline auto& tombstoneCleanupData() { return instance().tombstoneCleanupData; } + static inline auto& tenantGroupTenantIndex() { return instance().tenantGroupTenantIndex; } + static inline auto& tenantGroupMap() { return instance().tenantGroupMap; } - static inline Key tenantMapPrivatePrefix = "\xff"_sr.withSuffix(tenantMap.subspace.begin); + static Key tenantMapPrivatePrefix(); }; typedef VersionedMap TenantMap; -typedef VersionedMap TenantPrefixIndex; +class TenantPrefixIndex : public VersionedMap, public ReferenceCounted {}; #endif diff --git a/fdbclient/include/fdbclient/TenantManagement.actor.h b/fdbclient/include/fdbclient/TenantManagement.actor.h index b9e26d0df7..7499c8ddb7 100644 --- a/fdbclient/include/fdbclient/TenantManagement.actor.h +++ b/fdbclient/include/fdbclient/TenantManagement.actor.h @@ -21,6 +21,7 @@ #pragma once #include "fdbclient/ClientBooleanParams.h" #include "flow/IRandom.h" +#include "flow/ThreadHelper.actor.h" #if 
defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H) #define FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H #include "fdbclient/TenantManagement.actor.g.h" @@ -30,6 +31,7 @@ #include #include #include "fdbclient/GenericTransactionHelper.h" +#include "fdbclient/Metacluster.h" #include "fdbclient/SystemData.h" #include "flow/actorcompiler.h" // has to be last include @@ -38,7 +40,7 @@ namespace TenantAPI { template Future> tryGetTenantTransaction(Transaction tr, TenantName name) { tr->setOption(FDBTransactionOptions::RAW_ACCESS); - return TenantMetadata::tenantMap.get(tr, name); + return TenantMetadata::tenantMap().get(tr, name); } ACTOR template @@ -78,26 +80,60 @@ Future getTenant(Reference db, TenantName name) { } ACTOR template -Future checkTenantMode(Transaction tr) { +Future getClusterType(Transaction tr) { + Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + return metaclusterRegistration.present() ? metaclusterRegistration.get().clusterType : ClusterType::STANDALONE; +} + +ACTOR template +Future checkTenantMode(Transaction tr, ClusterType expectedClusterType) { state typename transaction_future_type>::type tenantModeFuture = tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr)); + state ClusterType actualClusterType = wait(getClusterType(tr)); Optional tenantModeValue = wait(safeThreadFutureToFuture(tenantModeFuture)); TenantMode tenantMode = TenantMode::fromValue(tenantModeValue.castTo()); - if (tenantMode == TenantMode::DISABLED) { + if (actualClusterType != expectedClusterType) { + throw invalid_metacluster_operation(); + } else if (actualClusterType == ClusterType::STANDALONE && tenantMode == TenantMode::DISABLED) { throw tenants_disabled(); } return Void(); } +TenantMode tenantModeForClusterType(ClusterType clusterType, TenantMode tenantMode); + +// Returns true if the specified ID has already been deleted and false if not. 
If the ID is old enough +// that we no longer keep tombstones for it, an error is thrown. +ACTOR template +Future checkTombstone(Transaction tr, int64_t id) { + state Future tombstoneFuture = TenantMetadata::tenantTombstones().exists(tr, id); + + // If we are trying to create a tenant older than the oldest tombstones we still maintain, then we fail it + // with an error. + Optional tombstoneCleanupData = wait(TenantMetadata::tombstoneCleanupData().get(tr)); + if (tombstoneCleanupData.present() && tombstoneCleanupData.get().tombstonesErasedThrough >= id) { + throw tenant_creation_permanently_failed(); + } + + state bool hasTombstone = wait(tombstoneFuture); + return hasTombstone; +} + // Creates a tenant with the given name. If the tenant already exists, the boolean return parameter will be false // and the existing entry will be returned. If the tenant cannot be created, then the optional will be empty. ACTOR template -Future, bool>> createTenantTransaction(Transaction tr, - TenantNameRef name, - TenantMapEntry tenantEntry) { +Future, bool>> createTenantTransaction( + Transaction tr, + TenantNameRef name, + TenantMapEntry tenantEntry, + ClusterType clusterType = ClusterType::STANDALONE) { + + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); ASSERT(tenantEntry.id >= 0); if (name.startsWith("\xff"_sr)) { @@ -110,17 +146,25 @@ Future, bool>> createTenantTransaction(Transa tr->setOption(FDBTransactionOptions::RAW_ACCESS); state Future> existingEntryFuture = tryGetTenantTransaction(tr, name); - wait(checkTenantMode(tr)); + state Future tenantModeCheck = checkTenantMode(tr, clusterType); + state Future tombstoneFuture = + (clusterType == ClusterType::STANDALONE) ? 
false : checkTombstone(tr, tenantEntry.id); state Future> existingTenantGroupEntryFuture; if (tenantEntry.tenantGroup.present()) { - existingTenantGroupEntryFuture = TenantMetadata::tenantGroupMap.get(tr, tenantEntry.tenantGroup.get()); + existingTenantGroupEntryFuture = TenantMetadata::tenantGroupMap().get(tr, tenantEntry.tenantGroup.get()); } + wait(tenantModeCheck); Optional existingEntry = wait(existingEntryFuture); if (existingEntry.present()) { return std::make_pair(existingEntry.get(), false); } + state bool hasTombstone = wait(tombstoneFuture); + if (hasTombstone) { + return std::make_pair(Optional(), false); + } + state typename transaction_future_type::type prefixRangeFuture = tr->getRange(prefixRange(tenantEntry.prefix), 1); @@ -130,23 +174,27 @@ Future, bool>> createTenantTransaction(Transa } tenantEntry.tenantState = TenantState::READY; - TenantMetadata::tenantMap.set(tr, name, tenantEntry); + tenantEntry.assignedCluster = Optional(); + + TenantMetadata::tenantMap().set(tr, name, tenantEntry); + TenantMetadata::tenantIdIndex().set(tr, tenantEntry.id, name); + if (tenantEntry.tenantGroup.present()) { - TenantMetadata::tenantGroupTenantIndex.insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name)); + TenantMetadata::tenantGroupTenantIndex().insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name)); // Create the tenant group associated with this tenant if it doesn't already exist Optional existingTenantGroup = wait(existingTenantGroupEntryFuture); if (!existingTenantGroup.present()) { - TenantMetadata::tenantGroupMap.set(tr, tenantEntry.tenantGroup.get(), TenantGroupEntry()); + TenantMetadata::tenantGroupMap().set(tr, tenantEntry.tenantGroup.get(), TenantGroupEntry()); } } // This is idempotent because we only add an entry to the tenant map if it isn't already there - TenantMetadata::tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + TenantMetadata::tenantCount().atomicOp(tr, 1, MutationRef::AddValue); // Read the tenant count after 
incrementing the counter so that simultaneous attempts to create // tenants in the same transaction are properly reflected. - int64_t tenantCount = wait(TenantMetadata::tenantCount.getD(tr, Snapshot::False, 0)); + int64_t tenantCount = wait(TenantMetadata::tenantCount().getD(tr, Snapshot::False, 0)); if (tenantCount > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { throw cluster_no_capacity(); } @@ -156,7 +204,7 @@ Future, bool>> createTenantTransaction(Transa ACTOR template Future getNextTenantId(Transaction tr) { - Optional lastId = wait(TenantMetadata::lastTenantId.get(tr)); + Optional lastId = wait(TenantMetadata::lastTenantId().get(tr)); int64_t tenantId = lastId.orDefault(-1) + 1; if (BUGGIFY) { tenantId += deterministicRandom()->randomSkewedUInt32(1, 1e9); @@ -167,12 +215,15 @@ Future getNextTenantId(Transaction tr) { ACTOR template Future> createTenant(Reference db, TenantName name, - TenantMapEntry tenantEntry = TenantMapEntry()) { + TenantMapEntry tenantEntry = TenantMapEntry(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); - state bool checkExistence = true; + state bool checkExistence = clusterType != ClusterType::METACLUSTER_DATA; state bool generateTenantId = tenantEntry.id < 0; + ASSERT(clusterType == ClusterType::STANDALONE || !generateTenantId); + loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -195,11 +246,11 @@ Future> createTenant(Reference db, if (generateTenantId) { int64_t tenantId = wait(tenantIdFuture); tenantEntry.setId(tenantId); - TenantMetadata::lastTenantId.set(tr, tenantId); + TenantMetadata::lastTenantId().set(tr, tenantId); } state std::pair, bool> newTenant = - wait(createTenantTransaction(tr, name, tenantEntry)); + wait(createTenantTransaction(tr, name, tenantEntry, clusterType)); if (newTenant.second) { ASSERT(newTenant.first.present()); @@ -220,6 +271,50 @@ Future> createTenant(Reference db, } } +ACTOR template +Future markTenantTombstones(Transaction tr, 
int64_t tenantId) { + // In data clusters, we store a tombstone + state Future> latestTombstoneFuture = + TenantMetadata::tenantTombstones().getRange(tr, {}, {}, 1, Snapshot::False, Reverse::True); + state Optional cleanupData = wait(TenantMetadata::tombstoneCleanupData().get(tr)); + state Version transactionReadVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); + + // If it has been long enough since we last cleaned up the tenant tombstones, we do that first + if (!cleanupData.present() || cleanupData.get().nextTombstoneEraseVersion <= transactionReadVersion) { + state int64_t deleteThroughId = cleanupData.present() ? cleanupData.get().nextTombstoneEraseId : -1; + // Delete all tombstones up through the one currently marked in the cleanup data + if (deleteThroughId >= 0) { + TenantMetadata::tenantTombstones().erase(tr, 0, deleteThroughId + 1); + } + + KeyBackedRangeResult latestTombstone = wait(latestTombstoneFuture); + int64_t nextDeleteThroughId = std::max(deleteThroughId, tenantId); + if (!latestTombstone.results.empty()) { + nextDeleteThroughId = std::max(nextDeleteThroughId, latestTombstone.results[0]); + } + + // The next cleanup will happen at or after TENANT_TOMBSTONE_CLEANUP_INTERVAL seconds have elapsed and + // will clean up tombstones through the most recently allocated ID. 
+ TenantTombstoneCleanupData updatedCleanupData; + updatedCleanupData.tombstonesErasedThrough = deleteThroughId; + updatedCleanupData.nextTombstoneEraseId = nextDeleteThroughId; + updatedCleanupData.nextTombstoneEraseVersion = + transactionReadVersion + + CLIENT_KNOBS->TENANT_TOMBSTONE_CLEANUP_INTERVAL * CLIENT_KNOBS->VERSIONS_PER_SECOND; + + TenantMetadata::tombstoneCleanupData().set(tr, updatedCleanupData); + + // If the tenant being deleted is within the tombstone window, record the tombstone + if (tenantId > updatedCleanupData.tombstonesErasedThrough) { + TenantMetadata::tenantTombstones().insert(tr, tenantId); + } + } else if (tenantId > cleanupData.get().tombstonesErasedThrough) { + // If the tenant being deleted is within the tombstone window, record the tombstone + TenantMetadata::tenantTombstones().insert(tr, tenantId); + } + return Void(); +} + // Deletes the tenant with the given name. If tenantId is specified, the tenant being deleted must also have the same // ID. If no matching tenant is found, this function returns without deleting anything. 
This behavior allows the // function to be used idempotently: if the transaction is retried after having succeeded, it will see that the tenant @@ -227,11 +322,15 @@ Future> createTenant(Reference db, ACTOR template Future deleteTenantTransaction(Transaction tr, TenantNameRef name, - Optional tenantId = Optional()) { + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { + ASSERT(clusterType == ClusterType::STANDALONE || tenantId.present()); + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); + tr->setOption(FDBTransactionOptions::RAW_ACCESS); state Future> tenantEntryFuture = tryGetTenantTransaction(tr, name); - wait(checkTenantMode(tr)); + wait(checkTenantMode(tr, clusterType)); state Optional tenantEntry = wait(tenantEntryFuture); if (tenantEntry.present() && (!tenantId.present() || tenantEntry.get().id == tenantId.get())) { @@ -244,34 +343,43 @@ Future deleteTenantTransaction(Transaction tr, } // This is idempotent because we only erase an entry from the tenant map if it is present - TenantMetadata::tenantMap.erase(tr, name); - TenantMetadata::tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + TenantMetadata::tenantMap().erase(tr, name); + TenantMetadata::tenantIdIndex().erase(tr, tenantEntry.get().id); + TenantMetadata::tenantCount().atomicOp(tr, -1, MutationRef::AddValue); if (tenantEntry.get().tenantGroup.present()) { - TenantMetadata::tenantGroupTenantIndex.erase(tr, - Tuple::makeTuple(tenantEntry.get().tenantGroup.get(), name)); - KeyBackedSet::RangeResultType tenantsInGroup = wait(TenantMetadata::tenantGroupTenantIndex.getRange( - tr, - Tuple::makeTuple(tenantEntry.get().tenantGroup.get()), - Tuple::makeTuple(keyAfter(tenantEntry.get().tenantGroup.get())), - 2)); + TenantMetadata::tenantGroupTenantIndex().erase(tr, + Tuple::makeTuple(tenantEntry.get().tenantGroup.get(), name)); + KeyBackedSet::RangeResultType tenantsInGroup = + wait(TenantMetadata::tenantGroupTenantIndex().getRange( + tr, + 
Tuple::makeTuple(tenantEntry.get().tenantGroup.get()), + Tuple::makeTuple(keyAfter(tenantEntry.get().tenantGroup.get())), + 2)); if (tenantsInGroup.results.empty() || (tenantsInGroup.results.size() == 1 && tenantsInGroup.results[0].getString(1) == name)) { - TenantMetadata::tenantGroupMap.erase(tr, tenantEntry.get().tenantGroup.get()); + TenantMetadata::tenantGroupMap().erase(tr, tenantEntry.get().tenantGroup.get()); } } } + if (clusterType == ClusterType::METACLUSTER_DATA) { + wait(markTenantTombstones(tr, tenantId.get())); + } + return Void(); } // Deletes the tenant with the given name. If tenantId is specified, the tenant being deleted must also have the same // ID. ACTOR template -Future deleteTenant(Reference db, TenantName name, Optional tenantId = Optional()) { +Future deleteTenant(Reference db, + TenantName name, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); - state bool checkExistence = true; + state bool checkExistence = clusterType == ClusterType::STANDALONE; loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -289,7 +397,7 @@ Future deleteTenant(Reference db, TenantName name, Optional t checkExistence = false; } - wait(deleteTenantTransaction(tr, name, tenantId)); + wait(deleteTenantTransaction(tr, name, tenantId, clusterType)); wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); TraceEvent("DeletedTenant").detail("Tenant", name).detail("Version", tr->getCommittedVersion()); @@ -308,8 +416,10 @@ Future configureTenantTransaction(Transaction tr, TenantNameRef tenantName, TenantMapEntry originalEntry, TenantMapEntry updatedTenantEntry) { + ASSERT(updatedTenantEntry.id == originalEntry.id); + tr->setOption(FDBTransactionOptions::RAW_ACCESS); - TenantMetadata::tenantMap.set(tr, tenantName, updatedTenantEntry); + TenantMetadata::tenantMap().set(tr, tenantName, updatedTenantEntry); // If the tenant group was changed, we need to update the tenant 
group metadata structures if (originalEntry.tenantGroup != updatedTenantEntry.tenantGroup) { @@ -318,11 +428,11 @@ Future configureTenantTransaction(Transaction tr, } if (originalEntry.tenantGroup.present()) { // Remove this tenant from the original tenant group index - TenantMetadata::tenantGroupTenantIndex.erase(tr, - Tuple::makeTuple(originalEntry.tenantGroup.get(), tenantName)); + TenantMetadata::tenantGroupTenantIndex().erase( + tr, Tuple::makeTuple(originalEntry.tenantGroup.get(), tenantName)); // Check if the original tenant group is now empty. If so, remove the tenant group. - KeyBackedSet::RangeResultType tenants = wait(TenantMetadata::tenantGroupTenantIndex.getRange( + KeyBackedSet::RangeResultType tenants = wait(TenantMetadata::tenantGroupTenantIndex().getRange( tr, Tuple::makeTuple(originalEntry.tenantGroup.get()), Tuple::makeTuple(keyAfter(originalEntry.tenantGroup.get())), @@ -330,19 +440,19 @@ Future configureTenantTransaction(Transaction tr, if (tenants.results.empty() || (tenants.results.size() == 1 && tenants.results[0].getString(1) == tenantName)) { - TenantMetadata::tenantGroupMap.erase(tr, originalEntry.tenantGroup.get()); + TenantMetadata::tenantGroupMap().erase(tr, originalEntry.tenantGroup.get()); } } if (updatedTenantEntry.tenantGroup.present()) { // If this is creating a new tenant group, add it to the tenant group map Optional entry = - wait(TenantMetadata::tenantGroupMap.get(tr, updatedTenantEntry.tenantGroup.get())); + wait(TenantMetadata::tenantGroupMap().get(tr, updatedTenantEntry.tenantGroup.get())); if (!entry.present()) { - TenantMetadata::tenantGroupMap.set(tr, updatedTenantEntry.tenantGroup.get(), TenantGroupEntry()); + TenantMetadata::tenantGroupMap().set(tr, updatedTenantEntry.tenantGroup.get(), TenantGroupEntry()); } // Insert this tenant in the tenant group index - TenantMetadata::tenantGroupTenantIndex.insert( + TenantMetadata::tenantGroupTenantIndex().insert( tr, Tuple::makeTuple(updatedTenantEntry.tenantGroup.get(), 
tenantName)); } } @@ -358,7 +468,7 @@ Future>> listTenantsTransactio tr->setOption(FDBTransactionOptions::RAW_ACCESS); KeyBackedRangeResult> results = - wait(TenantMetadata::tenantMap.getRange(tr, begin, end, limit)); + wait(TenantMetadata::tenantMap().getRange(tr, begin, end, limit)); return results.results; } @@ -384,33 +494,58 @@ Future>> listTenants(Reference } ACTOR template -Future renameTenantTransaction(Transaction tr, TenantNameRef oldName, TenantNameRef newName) { +Future renameTenantTransaction(Transaction tr, + TenantName oldName, + TenantName newName, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE, + Optional configureSequenceNum = Optional()) { + ASSERT(clusterType == ClusterType::STANDALONE || (tenantId.present() && configureSequenceNum.present())); + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); + wait(checkTenantMode(tr, clusterType)); tr->setOption(FDBTransactionOptions::RAW_ACCESS); state Optional oldEntry; state Optional newEntry; wait(store(oldEntry, tryGetTenantTransaction(tr, oldName)) && store(newEntry, tryGetTenantTransaction(tr, newName))); - if (!oldEntry.present()) { + if (!oldEntry.present() || (tenantId.present() && tenantId.get() != oldEntry.get().id)) { throw tenant_not_found(); } if (newEntry.present()) { throw tenant_already_exists(); } - TenantMetadata::tenantMap.erase(tr, oldName); - TenantMetadata::tenantMap.set(tr, newName, oldEntry.get()); + if (configureSequenceNum.present()) { + if (oldEntry.get().configurationSequenceNum >= configureSequenceNum.get()) { + return Void(); + } + oldEntry.get().configurationSequenceNum = configureSequenceNum.get(); + } + TenantMetadata::tenantMap().erase(tr, oldName); + TenantMetadata::tenantMap().set(tr, newName, oldEntry.get()); + TenantMetadata::tenantIdIndex().set(tr, oldEntry.get().id, newName); // Update the tenant group index to reflect the new tenant name if (oldEntry.get().tenantGroup.present()) { - 
TenantMetadata::tenantGroupTenantIndex.erase(tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), oldName)); - TenantMetadata::tenantGroupTenantIndex.insert(tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), newName)); + TenantMetadata::tenantGroupTenantIndex().erase(tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), oldName)); + TenantMetadata::tenantGroupTenantIndex().insert(tr, + Tuple::makeTuple(oldEntry.get().tenantGroup.get(), newName)); + } + + if (clusterType == ClusterType::METACLUSTER_DATA) { + wait(markTenantTombstones(tr, tenantId.get())); } return Void(); } ACTOR template -Future renameTenant(Reference db, TenantName oldName, TenantName newName) { +Future renameTenant(Reference db, + TenantName oldName, + TenantName newName, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); + ASSERT(clusterType == ClusterType::STANDALONE || tenantId.present()); state bool firstTry = true; state int64_t id; @@ -454,7 +589,7 @@ Future renameTenant(Reference db, TenantName oldName, TenantName newNa throw tenant_not_found(); } } - wait(renameTenantTransaction(tr, oldName, newName)); + wait(renameTenantTransaction(tr, oldName, newName, tenantId, clusterType)); wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); TraceEvent("RenameTenantSuccess").detail("OldName", oldName).detail("NewName", newName); return Void(); diff --git a/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h b/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h index 7c9ccc7cc2..af9d02c371 100644 --- a/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h +++ b/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h @@ -137,7 +137,7 @@ private: std::map, Optional>>> tenants, std::map* tenantGroupNetTenantDelta) { state Future tenantCountFuture = - TenantMetadata::tenantCount.getD(&ryw->getTransaction(), Snapshot::False, 0); + TenantMetadata::tenantCount().getD(&ryw->getTransaction(), Snapshot::False, 0); 
int64_t _nextId = wait(TenantAPI::getNextTenantId(&ryw->getTransaction())); state int64_t nextId = _nextId; @@ -146,7 +146,7 @@ private: createFutures.push_back(createTenant(ryw, tenant, config, nextId++, tenantGroupNetTenantDelta)); } - TenantMetadata::lastTenantId.set(&ryw->getTransaction(), nextId - 1); + TenantMetadata::lastTenantId().set(&ryw->getTransaction(), nextId - 1); wait(waitForAll(createFutures)); state int numCreatedTenants = 0; @@ -240,14 +240,14 @@ private: ASSERT(tenantDelta < 0); state int removedTenants = -tenantDelta; KeyBackedSet::RangeResultType tenantsInGroup = - wait(TenantMetadata::tenantGroupTenantIndex.getRange(&ryw->getTransaction(), - Tuple::makeTuple(tenantGroup), - Tuple::makeTuple(keyAfter(tenantGroup)), - removedTenants + 1)); + wait(TenantMetadata::tenantGroupTenantIndex().getRange(&ryw->getTransaction(), + Tuple::makeTuple(tenantGroup), + Tuple::makeTuple(keyAfter(tenantGroup)), + removedTenants + 1)); ASSERT(tenantsInGroup.results.size() >= removedTenants); if (tenantsInGroup.results.size() == removedTenants) { - TenantMetadata::tenantGroupMap.erase(&ryw->getTransaction(), tenantGroup); + TenantMetadata::tenantGroupMap().erase(&ryw->getTransaction(), tenantGroup); } return Void(); @@ -289,7 +289,7 @@ public: state std::set renameSet; state std::vector> renameMutations; - tenantManagementFutures.push_back(TenantAPI::checkTenantMode(&ryw->getTransaction())); + tenantManagementFutures.push_back(TenantAPI::checkTenantMode(&ryw->getTransaction(), ClusterType::STANDALONE)); for (auto range : ranges) { if (!range.value().first) { diff --git a/fdbclient/include/fdbclient/ThreadSafeTransaction.h b/fdbclient/include/fdbclient/ThreadSafeTransaction.h index 875664ea76..d72c4c8fc5 100644 --- a/fdbclient/include/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/include/fdbclient/ThreadSafeTransaction.h @@ -62,6 +62,13 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; 
ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -72,7 +79,8 @@ private: DatabaseContext* db; public: // Internal use only - ThreadSafeDatabase(Reference connectionRecord, int apiVersion); + enum class ConnectionRecordType { FILE, CONNECTION_STRING }; + ThreadSafeDatabase(ConnectionRecordType connectionRecordType, std::string connectionRecord, int apiVersion); ThreadSafeDatabase(DatabaseContext* db) : db(db) {} DatabaseContext* unsafeGetPtr() const { return db; } }; @@ -148,13 +156,26 @@ public: ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + void addReadConflictRange(const KeyRangeRef& keys) override; void makeSelfConflicting(); @@ -205,6 +226,7 @@ class ThreadSafeApi : public IClientApi, ThreadSafeReferenceCounted value = Optional()) override; void setupNetwork() override; @@ -221,7 +243,7 @@ private: 
ThreadSafeApi(); int apiVersion; - const std::string clientVersion; + std::string clientVersion; uint64_t transportId; Mutex lock; diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 242675e0b1..da092f5463 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -115,6 +115,9 @@ description is not currently required but encouraged.