Merge remote-tracking branch 'origin/main' into fix-remove-quota-ubsan-failure
commit 4e12249778
@@ -141,6 +141,7 @@ if(NOT WIN32)
   test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp
   test/apitester/TesterCancelTransactionWorkload.cpp
   test/apitester/TesterCorrectnessWorkload.cpp
+  test/apitester/TesterExampleWorkload.cpp
   test/apitester/TesterKeyValueStore.cpp
   test/apitester/TesterKeyValueStore.h
   test/apitester/TesterOptions.h
@@ -341,6 +342,17 @@ if(NOT WIN32)
  )
  set_tests_properties("fdb_c_upgrade_to_future_version" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}")

+  if (0) # reenable after stabilizing the test
+  add_test(NAME fdb_c_upgrade_to_future_version_blob_granules
+    COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
+    --build-dir ${CMAKE_BINARY_DIR}
+    --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml
+    --upgrade-path "7.2.0" "7.3.0" "7.2.0"
+    --blob-granules-enabled
+    --process-number 3
+  )
+  endif()

  if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER)
    add_test(NAME fdb_c_upgrade_single_threaded_630api
      COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
@@ -239,6 +239,10 @@ fdb_error_t fdb_future_get_version_v619(FDBFuture* f, int64_t* out_version) {
    CATCH_AND_RETURN(*out_version = TSAV(Version, f)->get(););
}

+extern "C" DLLEXPORT fdb_error_t fdb_future_get_bool(FDBFuture* f, fdb_bool_t* out_value) {
+    CATCH_AND_RETURN(*out_value = TSAV(bool, f)->get(););
+}
+
extern "C" DLLEXPORT fdb_error_t fdb_future_get_int64(FDBFuture* f, int64_t* out_value) {
    CATCH_AND_RETURN(*out_value = TSAV(int64_t, f)->get(););
}
@@ -494,6 +498,54 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDat
        FDBFuture*)(DB(db)->waitPurgeGranulesComplete(StringRef(purge_key_name, purge_key_name_length)).extractPtr());
}

extern "C" DLLEXPORT FDBFuture* fdb_database_blobbify_range(FDBDatabase* db,
                                                            uint8_t const* begin_key_name,
                                                            int begin_key_name_length,
                                                            uint8_t const* end_key_name,
                                                            int end_key_name_length) {
    return (FDBFuture*)(DB(db)
                            ->blobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
                                                        StringRef(end_key_name, end_key_name_length)))
                            .extractPtr());
}

extern "C" DLLEXPORT FDBFuture* fdb_database_unblobbify_range(FDBDatabase* db,
                                                              uint8_t const* begin_key_name,
                                                              int begin_key_name_length,
                                                              uint8_t const* end_key_name,
                                                              int end_key_name_length) {
    return (FDBFuture*)(DB(db)
                            ->unblobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
                                                          StringRef(end_key_name, end_key_name_length)))
                            .extractPtr());
}

extern "C" DLLEXPORT FDBFuture* fdb_database_list_blobbified_ranges(FDBDatabase* db,
                                                                    uint8_t const* begin_key_name,
                                                                    int begin_key_name_length,
                                                                    uint8_t const* end_key_name,
                                                                    int end_key_name_length,
                                                                    int rangeLimit) {
    return (FDBFuture*)(DB(db)
                            ->listBlobbifiedRanges(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
                                                               StringRef(end_key_name, end_key_name_length)),
                                                   rangeLimit)
                            .extractPtr());
}

extern "C" DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_range(FDBDatabase* db,
                                                                                  uint8_t const* begin_key_name,
                                                                                  int begin_key_name_length,
                                                                                  uint8_t const* end_key_name,
                                                                                  int end_key_name_length,
                                                                                  int64_t version) {
    return (FDBFuture*)(DB(db)
                            ->verifyBlobRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
                                                          StringRef(end_key_name, end_key_name_length)),
                                              version)
                            .extractPtr());
}

extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) {
    CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr(););
}
@@ -856,11 +908,12 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_blob_granule_ranges(FDBTrans
                                                                        uint8_t const* begin_key_name,
                                                                        int begin_key_name_length,
                                                                        uint8_t const* end_key_name,
-                                                                       int end_key_name_length) {
+                                                                       int end_key_name_length,
+                                                                       int rangeLimit) {
    RETURN_FUTURE_ON_ERROR(
        Standalone<VectorRef<KeyRangeRef>>,
        KeyRangeRef range(KeyRef(begin_key_name, begin_key_name_length), KeyRef(end_key_name, end_key_name_length));
-       return (FDBFuture*)(TXN(tr)->getBlobGranuleRanges(range).extractPtr()););
+       return (FDBFuture*)(TXN(tr)->getBlobGranuleRanges(range, rangeLimit).extractPtr()););
}

extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* tr,
@@ -227,6 +227,8 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_set_callback(FDBFuture* f,
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_error(FDBFuture* f);
#endif

+DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_bool(FDBFuture* f, fdb_bool_t* out);
+
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_int64(FDBFuture* f, int64_t* out);

DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_uint64(FDBFuture* f, uint64_t* out);
@@ -321,6 +323,32 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_wait_purge_granules_complet
                                                                                 uint8_t const* purge_key_name,
                                                                                 int purge_key_name_length);

DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_blobbify_range(FDBDatabase* db,
                                                                    uint8_t const* begin_key_name,
                                                                    int begin_key_name_length,
                                                                    uint8_t const* end_key_name,
                                                                    int end_key_name_length);

DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_unblobbify_range(FDBDatabase* db,
                                                                      uint8_t const* begin_key_name,
                                                                      int begin_key_name_length,
                                                                      uint8_t const* end_key_name,
                                                                      int end_key_name_length);

DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_list_blobbified_ranges(FDBDatabase* db,
                                                                            uint8_t const* begin_key_name,
                                                                            int begin_key_name_length,
                                                                            uint8_t const* end_key_name,
                                                                            int end_key_name_length,
                                                                            int rangeLimit);

DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_range(FDBDatabase* db,
                                                                       uint8_t const* begin_key_name,
                                                                       int begin_key_name_length,
                                                                       uint8_t const* end_key_name,
                                                                       int end_key_name_length,
                                                                       int64_t version);

DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant,
                                                                       FDBTransaction** out_transaction);

@@ -479,7 +507,8 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges(
                                                                                 uint8_t const* begin_key_name,
                                                                                 int begin_key_name_length,
                                                                                 uint8_t const* end_key_name,
-                                                                                int end_key_name_length);
+                                                                                int end_key_name_length,
+                                                                                int rangeLimit);

/* LatestVersion (-2) for readVersion means get read version from transaction
   Separated out as optional because BG reads can support longer-lived reads than normal FDB transactions */
@@ -124,8 +124,10 @@ private:
        } else if (err.code() != error_code_success) {
            ctx->onError(err);
        } else {
-           auto& [out_kv, out_count, out_more] = out;
+           auto resCopy = copyKeyValueArray(out);
+           auto& [resVector, out_more] = resCopy;
            ASSERT(!out_more);
            results.get()->assign(resVector.begin(), resVector.end());
            if (!seenReadSuccess) {
                info("BlobGranuleCorrectness::randomReadOp first success\n");
            }
@@ -178,7 +180,7 @@ private:
        }
        execTransaction(
            [begin, end, results](auto ctx) {
-               fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end).eraseType();
+               fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType();
                ctx->continueAfter(
                    f,
                    [ctx, f, results]() {
@@ -0,0 +1,65 @@
/*
 * TesterExampleWorkload.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "TesterWorkload.h"
#include "TesterUtil.h"

namespace FdbApiTester {

class SetAndGetWorkload : public WorkloadBase {
public:
    fdb::Key keyPrefix;
    Random random;

    SetAndGetWorkload(const WorkloadConfig& config) : WorkloadBase(config) {
        keyPrefix = fdb::toBytesRef(fmt::format("{}/", workloadId));
    }

    void start() override { setAndGet(NO_OP_TASK); }

    void setAndGet(TTaskFct cont) {
        fdb::Key key = keyPrefix + random.randomStringLowerCase(10, 100);
        fdb::Value value = random.randomStringLowerCase(10, 1000);
        execTransaction(
            [key, value](auto ctx) {
                ctx->tx().set(key, value);
                ctx->commit();
            },
            [this, key, value, cont]() {
                execTransaction(
                    [this, key, value](auto ctx) {
                        auto future = ctx->tx().get(key, false);
                        ctx->continueAfter(future, [this, ctx, future, value]() {
                            std::optional<fdb::Value> res = copyValueRef(future.get());
                            if (res != value) {
                                error(fmt::format(
                                    "expected: {} actual: {}", fdb::toCharsRef(value), fdb::toCharsRef(res.value())));
                            }
                            ctx->done();
                        });
                    },
                    cont);
            });
    }
};

WorkloadFactory<SetAndGetWorkload> SetAndGetWorkloadFactory("SetAndGet");

} // namespace FdbApiTester
@@ -0,0 +1,23 @@
[[test]]
title = 'Mixed Workload for Upgrade Tests with a Multi-Threaded Client'
multiThreaded = true
buggify = true
databasePerTransaction = false
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8

[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
runUntilStop = true
@@ -32,4 +32,14 @@ maxClients = 8
maxKeysPerTransaction = 50
initialSize = 100
runUntilStop = true
readExistingKeysRatio = 0.9
+
+[[test.workload]]
+name = 'AtomicOpsCorrectness'
+initialSize = 0
+runUntilStop = true
+
+[[test.workload]]
+name = 'WatchAndWait'
+initialSize = 0
+runUntilStop = true
@@ -30,4 +30,14 @@ maxClients = 8
maxKeysPerTransaction = 50
initialSize = 100
runUntilStop = true
readExistingKeysRatio = 0.9
+
+[[test.workload]]
+name = 'AtomicOpsCorrectness'
+initialSize = 0
+runUntilStop = true
+
+[[test.workload]]
+name = 'WatchAndWait'
+initialSize = 0
+runUntilStop = true
@@ -559,9 +559,9 @@ public:
                            reverse);
    }

-   TypedFuture<future_var::KeyRangeRefArray> getBlobGranuleRanges(KeyRef begin, KeyRef end) {
+   TypedFuture<future_var::KeyRangeRefArray> getBlobGranuleRanges(KeyRef begin, KeyRef end, int rangeLimit) {
        return native::fdb_transaction_get_blob_granule_ranges(
-           tr.get(), begin.data(), intSize(begin), end.data(), intSize(end));
+           tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
    }

    Result readBlobGranules(KeyRef begin,
@@ -356,9 +356,15 @@ fdb_error_t Transaction::add_conflict_range(std::string_view begin_key,
        tr_, (const uint8_t*)begin_key.data(), begin_key.size(), (const uint8_t*)end_key.data(), end_key.size(), type);
}

-KeyRangeArrayFuture Transaction::get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key) {
-   return KeyRangeArrayFuture(fdb_transaction_get_blob_granule_ranges(
-       tr_, (const uint8_t*)begin_key.data(), begin_key.size(), (const uint8_t*)end_key.data(), end_key.size()));
+KeyRangeArrayFuture Transaction::get_blob_granule_ranges(std::string_view begin_key,
+                                                         std::string_view end_key,
+                                                         int rangeLimit) {
+   return KeyRangeArrayFuture(fdb_transaction_get_blob_granule_ranges(tr_,
+                                                                      (const uint8_t*)begin_key.data(),
+                                                                      begin_key.size(),
+                                                                      (const uint8_t*)end_key.data(),
+                                                                      end_key.size(),
+                                                                      rangeLimit));
}
KeyValueArrayResult Transaction::read_blob_granules(std::string_view begin_key,
                                                    std::string_view end_key,
@@ -348,7 +348,7 @@ public:
    // Wrapper around fdb_transaction_add_conflict_range.
    fdb_error_t add_conflict_range(std::string_view begin_key, std::string_view end_key, FDBConflictRangeType type);

-   KeyRangeArrayFuture get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key);
+   KeyRangeArrayFuture get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key, int rangeLimit);
    KeyValueArrayResult read_blob_granules(std::string_view begin_key,
                                           std::string_view end_key,
                                           int64_t beginVersion,
@@ -2853,7 +2853,7 @@ TEST_CASE("Blob Granule Functions") {
    // test ranges

    while (1) {
-       fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh"));
+       fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh"), 1000);
        fdb_error_t err = wait_future(f);
        if (err) {
            fdb::EmptyFuture f2 = tr.on_error(err);
@@ -34,9 +34,11 @@ set(JAVA_BINDING_SRCS
  src/main/com/apple/foundationdb/FDBDatabase.java
  src/main/com/apple/foundationdb/FDBTenant.java
  src/main/com/apple/foundationdb/FDBTransaction.java
+ src/main/com/apple/foundationdb/FutureBool.java
  src/main/com/apple/foundationdb/FutureInt64.java
  src/main/com/apple/foundationdb/FutureKey.java
  src/main/com/apple/foundationdb/FutureKeyArray.java
+ src/main/com/apple/foundationdb/FutureKeyRangeArray.java
  src/main/com/apple/foundationdb/FutureResult.java
  src/main/com/apple/foundationdb/FutureResults.java
  src/main/com/apple/foundationdb/FutureMappedResults.java
@@ -56,6 +58,7 @@ set(JAVA_BINDING_SRCS
  src/main/com/apple/foundationdb/RangeQuery.java
  src/main/com/apple/foundationdb/MappedRangeQuery.java
  src/main/com/apple/foundationdb/KeyArrayResult.java
+ src/main/com/apple/foundationdb/KeyRangeArrayResult.java
  src/main/com/apple/foundationdb/RangeResult.java
  src/main/com/apple/foundationdb/MappedRangeResult.java
  src/main/com/apple/foundationdb/RangeResultInfo.java
@@ -25,9 +25,11 @@
#include "com_apple_foundationdb_FDB.h"
#include "com_apple_foundationdb_FDBDatabase.h"
#include "com_apple_foundationdb_FDBTransaction.h"
+#include "com_apple_foundationdb_FutureBool.h"
#include "com_apple_foundationdb_FutureInt64.h"
#include "com_apple_foundationdb_FutureKey.h"
#include "com_apple_foundationdb_FutureKeyArray.h"
+#include "com_apple_foundationdb_FutureKeyRangeArray.h"
#include "com_apple_foundationdb_FutureResult.h"
#include "com_apple_foundationdb_FutureResults.h"
#include "com_apple_foundationdb_FutureStrings.h"
@@ -55,7 +57,11 @@ static jclass mapped_range_result_class;
static jclass mapped_key_value_class;
static jclass string_class;
static jclass key_array_result_class;
+static jclass keyrange_class;
+static jclass keyrange_array_result_class;
static jmethodID key_array_result_init;
+static jmethodID keyrange_init;
+static jmethodID keyrange_array_result_init;
static jmethodID range_result_init;
static jmethodID mapped_range_result_init;
static jmethodID mapped_key_value_from_bytes;
@@ -278,6 +284,23 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_NativeFuture_Future_1releaseM
    fdb_future_release_memory(var);
}

JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FutureBool_FutureBool_1get(JNIEnv* jenv, jobject, jlong future) {
    if (!future) {
        throwParamNotNull(jenv);
        return 0;
    }
    FDBFuture* f = (FDBFuture*)future;

    fdb_bool_t value = false;
    fdb_error_t err = fdb_future_get_bool(f, &value);
    if (err) {
        safeThrow(jenv, getThrowable(jenv, err));
        return 0;
    }

    return (jboolean)value;
}

JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FutureInt64_FutureInt64_1get(JNIEnv* jenv, jobject, jlong future) {
    if (!future) {
        throwParamNotNull(jenv);
@@ -407,6 +430,61 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureKeyArray_FutureKeyAr
    return result;
}

JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureKeyRangeArray_FutureKeyRangeArray_1get(JNIEnv* jenv,
                                                                                                   jobject,
                                                                                                   jlong future) {
    if (!future) {
        throwParamNotNull(jenv);
        return JNI_NULL;
    }

    FDBFuture* f = (FDBFuture*)future;

    const FDBKeyRange* fdbKr;
    int count;
    fdb_error_t err = fdb_future_get_keyrange_array(f, &fdbKr, &count);
    if (err) {
        safeThrow(jenv, getThrowable(jenv, err));
        return JNI_NULL;
    }

    jobjectArray kr_values = jenv->NewObjectArray(count, keyrange_class, NULL);
    if (!kr_values) {
        if (!jenv->ExceptionOccurred())
            throwOutOfMem(jenv);
        return JNI_NULL;
    }

    for (int i = 0; i < count; i++) {
        jbyteArray beginArr = jenv->NewByteArray(fdbKr[i].begin_key_length);
        if (!beginArr) {
            if (!jenv->ExceptionOccurred())
                throwOutOfMem(jenv);
            return JNI_NULL;
        }
        jbyteArray endArr = jenv->NewByteArray(fdbKr[i].end_key_length);
        if (!endArr) {
            if (!jenv->ExceptionOccurred())
                throwOutOfMem(jenv);
            return JNI_NULL;
        }
        jenv->SetByteArrayRegion(beginArr, 0, fdbKr[i].begin_key_length, (const jbyte*)fdbKr[i].begin_key);
        jenv->SetByteArrayRegion(endArr, 0, fdbKr[i].end_key_length, (const jbyte*)fdbKr[i].end_key);

        jobject kr = jenv->NewObject(keyrange_class, keyrange_init, beginArr, endArr);
        if (jenv->ExceptionOccurred())
            return JNI_NULL;
        jenv->SetObjectArrayElement(kr_values, i, kr);
        if (jenv->ExceptionOccurred())
            return JNI_NULL;
    }
    jobject krarr = jenv->NewObject(keyrange_array_result_class, keyrange_array_result_init, kr_values);
    if (jenv->ExceptionOccurred())
        return JNI_NULL;

    return krarr;
}

// SOMEDAY: explore doing this more efficiently with Direct ByteBuffers
JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResults_1get(JNIEnv* jenv,
                                                                                       jobject,
@@ -830,6 +908,142 @@ Java_com_apple_foundationdb_FDBDatabase_Database_1waitPurgeGranulesComplete(JNIE
    return (jlong)f;
}

JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1blobbifyRange(JNIEnv* jenv,
                                                                                        jobject,
                                                                                        jlong dbPtr,
                                                                                        jbyteArray beginKeyBytes,
                                                                                        jbyteArray endKeyBytes) {
    if (!dbPtr || !beginKeyBytes || !endKeyBytes) {
        throwParamNotNull(jenv);
        return 0;
    }

    FDBDatabase* database = (FDBDatabase*)dbPtr;

    uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
    if (!beginKeyArr) {
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
    if (!endKeyArr) {
        jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    FDBFuture* f = fdb_database_blobbify_range(
        database, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes));
    jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
    jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT);
    return (jlong)f;
}

JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1unblobbifyRange(JNIEnv* jenv,
                                                                                          jobject,
                                                                                          jlong dbPtr,
                                                                                          jbyteArray beginKeyBytes,
                                                                                          jbyteArray endKeyBytes) {
    if (!dbPtr || !beginKeyBytes || !endKeyBytes) {
        throwParamNotNull(jenv);
        return 0;
    }

    FDBDatabase* database = (FDBDatabase*)dbPtr;

    uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
    if (!beginKeyArr) {
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
    if (!endKeyArr) {
        jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    FDBFuture* f = fdb_database_unblobbify_range(
        database, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes));
    jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT);
    jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT);
    return (jlong)f;
}

JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1listBlobbifiedRanges(JNIEnv* jenv,
                                                                                               jobject,
                                                                                               jlong dbPtr,
                                                                                               jbyteArray beginKeyBytes,
                                                                                               jbyteArray endKeyBytes,
                                                                                               jint rangeLimit) {
    if (!dbPtr || !beginKeyBytes || !endKeyBytes) {
        throwParamNotNull(jenv);
        return 0;
    }
    FDBDatabase* tr = (FDBDatabase*)dbPtr;

    uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
    if (!startKey) {
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
    if (!endKey) {
        jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    FDBFuture* f = fdb_database_list_blobbified_ranges(
        tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rangeLimit);
    jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
    jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT);
    return (jlong)f;
}

JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1verifyBlobRange(JNIEnv* jenv,
                                                                                          jobject,
                                                                                          jlong dbPtr,
                                                                                          jbyteArray beginKeyBytes,
                                                                                          jbyteArray endKeyBytes,
                                                                                          jlong version) {
    if (!dbPtr || !beginKeyBytes || !endKeyBytes) {
        throwParamNotNull(jenv);
        return 0;
    }
    FDBDatabase* tr = (FDBDatabase*)dbPtr;

    uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
    if (!startKey) {
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
    if (!endKey) {
        jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    FDBFuture* f = fdb_database_verify_blob_range(
        tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), version);
    jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
    jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT);
    return (jlong)f;
}

JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv,
                                                                            jobject,
                                                                            jint predicate,
@@ -1307,6 +1521,41 @@ Java_com_apple_foundationdb_FDBTransaction_Transaction_1getRangeSplitPoints(JNIE
    return (jlong)f;
}

JNIEXPORT jlong JNICALL
Java_com_apple_foundationdb_FDBTransaction_Transaction_1getBlobGranuleRanges(JNIEnv* jenv,
                                                                             jobject,
                                                                             jlong tPtr,
                                                                             jbyteArray beginKeyBytes,
                                                                             jbyteArray endKeyBytes,
                                                                             jint rowLimit) {
    if (!tPtr || !beginKeyBytes || !endKeyBytes || !rowLimit) {
        throwParamNotNull(jenv);
        return 0;
    }
    FDBTransaction* tr = (FDBTransaction*)tPtr;

    uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL);
    if (!startKey) {
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL);
    if (!endKey) {
        jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
        if (!jenv->ExceptionOccurred())
            throwRuntimeEx(jenv, "Error getting handle to native resources");
        return 0;
    }

    FDBFuture* f = fdb_transaction_get_blob_granule_ranges(
        tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rowLimit);
    jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
    jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT);
    return (jlong)f;
}

JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1set(JNIEnv* jenv,
                                                                                   jobject,
                                                                                   jlong tPtr,
@@ -1746,6 +1995,15 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) {
    key_array_result_init = env->GetMethodID(local_key_array_result_class, "<init>", "([B[I)V");
    key_array_result_class = (jclass)(env)->NewGlobalRef(local_key_array_result_class);

+   jclass local_keyrange_class = env->FindClass("com/apple/foundationdb/Range");
+   keyrange_init = env->GetMethodID(local_keyrange_class, "<init>", "([B[B)V");
+   keyrange_class = (jclass)(env)->NewGlobalRef(local_keyrange_class);
+
+   jclass local_keyrange_array_result_class = env->FindClass("com/apple/foundationdb/KeyRangeArrayResult");
+   keyrange_array_result_init =
+       env->GetMethodID(local_keyrange_array_result_class, "<init>", "([Lcom/apple/foundationdb/Range;)V");
+   keyrange_array_result_class = (jclass)(env)->NewGlobalRef(local_keyrange_array_result_class);
+
    jclass local_range_result_summary_class = env->FindClass("com/apple/foundationdb/RangeResultSummary");
    range_result_summary_init = env->GetMethodID(local_range_result_summary_class, "<init>", "([BIZ)V");
    range_result_summary_class = (jclass)(env)->NewGlobalRef(local_range_result_summary_class);
@@ -1770,6 +2028,12 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) {
    if (range_result_class != JNI_NULL) {
        env->DeleteGlobalRef(range_result_class);
    }
+   if (keyrange_array_result_class != JNI_NULL) {
+       env->DeleteGlobalRef(keyrange_array_result_class);
+   }
+   if (keyrange_class != JNI_NULL) {
+       env->DeleteGlobalRef(keyrange_class);
+   }
    if (mapped_range_result_class != JNI_NULL) {
        env->DeleteGlobalRef(mapped_range_result_class);
    }
@@ -161,6 +161,20 @@ public interface Database extends AutoCloseable, TransactionContext {
     */
    double getMainThreadBusyness();

+   /**
+    * Runs {@link #purgeBlobGranules(Function)} on the default executor.
+    *
+    * @param beginKey start of the key range
+    * @param endKey end of the key range
+    * @param purgeVersion version to purge at
+    * @param force if true delete all data, if not keep data >= purgeVersion
+    *
+    * @return the key to watch for purge complete
+    */
+   default CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force) {
+       return purgeBlobGranules(beginKey, endKey, purgeVersion, force, getExecutor());
+   }
+
    /**
     * Queues a purge of blob granules for the specified key range, at the specified version.
     *
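The purge pair is meant to be used together: purgeBlobGranules returns a key, and waitPurgeGranulesComplete watches it. A minimal sketch, assuming a 7.2+ client on a cluster with blob granules enabled; the cluster file name, the [a, b) range, and the class name are illustrative, not part of this change:

import com.apple.foundationdb.Database;
import com.apple.foundationdb.FDB;

public class PurgeGranulesExample {
    public static void main(String[] args) {
        FDB fdb = FDB.selectAPIVersion(720);
        // "fdb.cluster" and the [a, b) range are placeholders for illustration.
        try (Database db = fdb.open("fdb.cluster")) {
            byte[] begin = "a".getBytes();
            byte[] end = "b".getBytes();
            // Purge through the current read version, keeping data at
            // versions >= purgeVersion (force = false).
            long purgeVersion = db.runAsync(tr -> tr.getReadVersion()).join();
            byte[] purgeKey = db.purgeBlobGranules(begin, end, purgeVersion, false).join();
            // The returned key is the one to watch for purge completion.
            db.waitPurgeGranulesComplete(purgeKey).join();
        }
    }
}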
@@ -168,17 +182,126 @@ public interface Database extends AutoCloseable, TransactionContext {
     * @param endKey end of the key range
     * @param purgeVersion version to purge at
     * @param force if true delete all data, if not keep data >= purgeVersion
     * @param e the {@link Executor} to use for asynchronous callbacks

     * @return the key to watch for purge complete
     */
    CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e);


    /**
-    * Wait for a previous call to purgeBlobGranules to complete
+    * Runs {@link #waitPurgeGranulesComplete(Function)} on the default executor.
     *
     * @param purgeKey key to watch
     */
    default CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey) {
        return waitPurgeGranulesComplete(purgeKey, getExecutor());
    }

    /**
     * Wait for a previous call to purgeBlobGranules to complete.
     *
     * @param purgeKey key to watch
     * @param e the {@link Executor} to use for asynchronous callbacks
     */
    CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey, Executor e);

    /**
     * Runs {@link #blobbifyRange(Function)} on the default executor.
     *
     * @param beginKey start of the key range
     * @param endKey end of the key range

     * @return if the recording of the range was successful
     */
    default CompletableFuture<Boolean> blobbifyRange(byte[] beginKey, byte[] endKey) {
        return blobbifyRange(beginKey, endKey, getExecutor());
    }

    /**
     * Sets a range to be blobbified in the database. Must be a completely unblobbified range.
     *
     * @param beginKey start of the key range
     * @param endKey end of the key range
     * @param e the {@link Executor} to use for asynchronous callbacks

     * @return if the recording of the range was successful
     */
    CompletableFuture<Boolean> blobbifyRange(byte[] beginKey, byte[] endKey, Executor e);

    /**
     * Runs {@link #unblobbifyRange(Function)} on the default executor.
     *
     * @param beginKey start of the key range
     * @param endKey end of the key range

     * @return if the recording of the range was successful
     */
    default CompletableFuture<Boolean> unblobbifyRange(byte[] beginKey, byte[] endKey) {
        return unblobbifyRange(beginKey, endKey, getExecutor());
    }

    /**
     * Sets a range to be unblobbified in the database.
     *
     * @param beginKey start of the key range
     * @param endKey end of the key range
     * @param e the {@link Executor} to use for asynchronous callbacks

     * @return if the recording of the range was successful
     */
    CompletableFuture<Boolean> unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e);

    /**
     * Runs {@link #listBlobbifiedRanges(Function)} on the default executor.
     *
     * @param beginKey start of the key range
     * @param endKey end of the key range
     * @param rangeLimit batch size
     * @param e the {@link Executor} to use for asynchronous callbacks

     * @return a future with the list of blobbified ranges.
     */
    default CompletableFuture<KeyRangeArrayResult> listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit) {
        return listBlobbifiedRanges(beginKey, endKey, rangeLimit, getExecutor());
    }

    /**
     * Lists blobbified ranges in the database. There may be more if result.size() == rangeLimit.
     *
     * @param beginKey start of the key range
     * @param endKey end of the key range
     * @param rangeLimit batch size
     * @param e the {@link Executor} to use for asynchronous callbacks

     * @return a future with the list of blobbified ranges.
     */
    CompletableFuture<KeyRangeArrayResult> listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e);

    /**
     * Runs {@link #verifyBlobRange(Function)} on the default executor.
     *
     * @param beginKey start of the key range
     * @param endKey end of the key range
     * @param version version to read at
     *
     * @return a future with the version of the last blob granule.
     */
    default CompletableFuture<Long> verifyBlobRange(byte[] beginKey, byte[] endKey, long version) {
        return verifyBlobRange(beginKey, endKey, version, getExecutor());
    }

    /**
     * Checks if a blob range is blobbified.
     *
     * @param beginKey start of the key range
     * @param endKey end of the key range
     * @param version version to read at
     *
     * @return a future with the version of the last blob granule.
     */
    CompletableFuture<Long> verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e);

    /**
     * Runs a read-only transactional function against this {@code Database} with retry logic.
     * {@link Function#apply(Object) apply(ReadTransaction)} will be called on the
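Together with the purge pair above, these methods form the blob-granule management surface of the Java binding. A hedged sketch of registering, listing, and verifying a range (assumes blob granules are enabled on the cluster; the cluster file, range, and class name are illustrative):

import java.util.List;

import com.apple.foundationdb.Database;
import com.apple.foundationdb.FDB;
import com.apple.foundationdb.KeyRangeArrayResult;
import com.apple.foundationdb.Range;

public class BlobbifyExample {
    public static void main(String[] args) {
        FDB fdb = FDB.selectAPIVersion(720);
        try (Database db = fdb.open("fdb.cluster")) { // placeholder cluster file
            byte[] begin = "a".getBytes();
            byte[] end = "b".getBytes();

            // Register the (previously unblobbified) range for blob granules.
            boolean recorded = db.blobbifyRange(begin, end).join();
            System.out.println("blobbifyRange recorded: " + recorded);

            // List what is blobbified; a full batch (size == rangeLimit)
            // means there may be more ranges to fetch.
            KeyRangeArrayResult res = db.listBlobbifiedRanges(begin, end, 100).join();
            List<Range> ranges = res.getKeyRanges();
            System.out.println("blobbified ranges: " + ranges.size());

            // Verify the range at the current read version; the future yields
            // the version of the last blob granule.
            long readVersion = db.runAsync(tr -> tr.getReadVersion()).join();
            long verified = db.verifyBlobRange(begin, end, readVersion).join();
            System.out.println("verified through version: " + verified);
        }
    }
}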
@@ -201,20 +201,60 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume
    }

    @Override
-   public CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor executor) {
+   public CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e) {
        pointerReadLock.lock();
        try {
-           return new FutureKey(Database_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force), executor, eventKeeper);
+           return new FutureKey(Database_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force), e, eventKeeper);
        } finally {
            pointerReadLock.unlock();
        }
    }

    @Override
-   public CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey, Executor executor) {
+   public CompletableFuture<Void> waitPurgeGranulesComplete(byte[] purgeKey, Executor e) {
        pointerReadLock.lock();
        try {
-           return new FutureVoid(Database_waitPurgeGranulesComplete(getPtr(), purgeKey), executor);
+           return new FutureVoid(Database_waitPurgeGranulesComplete(getPtr(), purgeKey), e);
        } finally {
            pointerReadLock.unlock();
        }
    }

    @Override
    public CompletableFuture<Boolean> blobbifyRange(byte[] beginKey, byte[] endKey, Executor e) {
        pointerReadLock.lock();
        try {
            return new FutureBool(Database_blobbifyRange(getPtr(), beginKey, endKey), e);
        } finally {
            pointerReadLock.unlock();
        }
    }

    @Override
    public CompletableFuture<Boolean> unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e) {
        pointerReadLock.lock();
        try {
            return new FutureBool(Database_unblobbifyRange(getPtr(), beginKey, endKey), e);
        } finally {
            pointerReadLock.unlock();
        }
    }

    @Override
    public CompletableFuture<KeyRangeArrayResult> listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e) {
        pointerReadLock.lock();
        try {
            return new FutureKeyRangeArray(Database_listBlobbifiedRanges(getPtr(), beginKey, endKey, rangeLimit), e);
        } finally {
            pointerReadLock.unlock();
        }
    }

    @Override
    public CompletableFuture<Long> verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e) {
        pointerReadLock.lock();
        try {
            return new FutureInt64(Database_verifyBlobRange(getPtr(), beginKey, endKey, version), e);
        } finally {
            pointerReadLock.unlock();
@@ -237,4 +277,8 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume
    private native double Database_getMainThreadBusyness(long cPtr);
    private native long Database_purgeBlobGranules(long cPtr, byte[] beginKey, byte[] endKey, long purgeVersion, boolean force);
    private native long Database_waitPurgeGranulesComplete(long cPtr, byte[] purgeKey);
+   private native long Database_blobbifyRange(long cPtr, byte[] beginKey, byte[] endKey);
+   private native long Database_unblobbifyRange(long cPtr, byte[] beginKey, byte[] endKey);
+   private native long Database_listBlobbifiedRanges(long cPtr, byte[] beginKey, byte[] endKey, int rangeLimit);
+   private native long Database_verifyBlobRange(long cPtr, byte[] beginKey, byte[] endKey, long version);
}
@@ -97,6 +97,11 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC
            return FDBTransaction.this.getRangeSplitPoints(range, chunkSize);
        }

+       @Override
+       public CompletableFuture<KeyRangeArrayResult> getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit) {
+           return FDBTransaction.this.getBlobGranuleRanges(begin, end, rowLimit);
+       }
+
        @Override
        public AsyncIterable<MappedKeyValue> getMappedRange(KeySelector begin, KeySelector end, byte[] mapper,
                                                            int limit, int matchIndex, boolean reverse,
@@ -352,6 +357,16 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC
        return this.getRangeSplitPoints(range.begin, range.end, chunkSize);
    }

+   @Override
+   public CompletableFuture<KeyRangeArrayResult> getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit) {
+       pointerReadLock.lock();
+       try {
+           return new FutureKeyRangeArray(Transaction_getBlobGranuleRanges(getPtr(), begin, end, rowLimit), executor);
+       } finally {
+           pointerReadLock.unlock();
+       }
+   }
+
    @Override
    public AsyncIterable<MappedKeyValue> getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit,
                                                        int matchIndex, boolean reverse, StreamingMode mode) {
@@ -842,4 +857,5 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC
    private native long Transaction_getKeyLocations(long cPtr, byte[] key);
    private native long Transaction_getEstimatedRangeSizeBytes(long cPtr, byte[] keyBegin, byte[] keyEnd);
    private native long Transaction_getRangeSplitPoints(long cPtr, byte[] keyBegin, byte[] keyEnd, long chunkSize);
+   private native long Transaction_getBlobGranuleRanges(long cPtr, byte[] keyBegin, byte[] keyEnd, int rowLimit);
}
@@ -0,0 +1,37 @@
/*
 * FutureBool.java
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.apple.foundationdb;

import java.util.concurrent.Executor;

class FutureBool extends NativeFuture<Boolean> {
    FutureBool(long cPtr, Executor executor) {
        super(cPtr);
        registerMarshalCallback(executor);
    }

    @Override
    protected Boolean getIfDone_internal(long cPtr) throws FDBException {
        return FutureBool_get(cPtr);
    }

    private native boolean FutureBool_get(long cPtr) throws FDBException;
}
@@ -0,0 +1,37 @@
/*
 * FutureKeyRangeArray.java
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.apple.foundationdb;

import java.util.concurrent.Executor;

class FutureKeyRangeArray extends NativeFuture<KeyRangeArrayResult> {
    FutureKeyRangeArray(long cPtr, Executor executor) {
        super(cPtr);
        registerMarshalCallback(executor);
    }

    @Override
    protected KeyRangeArrayResult getIfDone_internal(long cPtr) throws FDBException {
        return FutureKeyRangeArray_get(cPtr);
    }

    private native KeyRangeArrayResult FutureKeyRangeArray_get(long cPtr) throws FDBException;
}
@@ -0,0 +1,36 @@
/*
 * KeyRangeArrayResult.java
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.apple.foundationdb;

import java.util.Arrays;
import java.util.List;

public class KeyRangeArrayResult {
    final List<Range> keyRanges;

    public KeyRangeArrayResult(Range[] keyRangeArr) {
        this.keyRanges = Arrays.asList(keyRangeArr);
    }

    public List<Range> getKeyRanges() {
        return keyRanges;
    }
}
@@ -513,6 +513,17 @@ public interface ReadTransaction extends ReadTransactionContext {
     */
    CompletableFuture<KeyArrayResult> getRangeSplitPoints(Range range, long chunkSize);

+   /**
+    * Gets the blob granule ranges for a given region.
+    * Returned in batches, requires calling again moving the begin key up.
+    *
+    * @param begin beginning of the range (inclusive)
+    * @param end end of the range (exclusive)
+
+    * @return list of blob granules in the given range. May not be all.
+    */
+   CompletableFuture<KeyRangeArrayResult> getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit);
+

    /**
     * Returns a set of options that can be set on a {@code Transaction}
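Because results come back in batches of at most rowLimit ranges, a caller loops until a short batch, moving the begin key up each time, exactly as the javadoc above describes. A minimal sketch of that pattern (the class and method names are illustrative, not part of this change):

import java.util.ArrayList;
import java.util.List;

import com.apple.foundationdb.Database;
import com.apple.foundationdb.KeyRangeArrayResult;
import com.apple.foundationdb.Range;

final class GranuleRangePager {
    // Collects all blob granule ranges in [begin, end) by paging in batches
    // of rowLimit, advancing the begin key past the last returned range.
    static List<Range> allGranuleRanges(Database db, byte[] begin, byte[] end, int rowLimit) {
        List<Range> all = new ArrayList<>();
        byte[] cursor = begin;
        while (true) {
            final byte[] b = cursor;
            KeyRangeArrayResult batch = db.read(tr -> tr.getBlobGranuleRanges(b, end, rowLimit).join());
            List<Range> krs = batch.getKeyRanges();
            all.addAll(krs);
            if (krs.size() < rowLimit) {
                break; // short batch: reached the end of the region
            }
            cursor = krs.get(krs.size() - 1).end; // next batch starts here
        }
        return all;
    }
}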
@@ -69,6 +69,7 @@ if(WIN32)
  add_definitions(-DWIN32_LEAN_AND_MEAN)
  add_definitions(-D_ITERATOR_DEBUG_LEVEL=0)
  add_definitions(-DNOGDI) # WinGDI.h defines macro ERROR
+ add_definitions(-D_USE_MATH_DEFINES) # Math constants
endif()

if (USE_CCACHE)
@@ -302,6 +302,7 @@ namespace SummarizeTest
                    uniqueFileSet.Add(file.Substring(0, file.LastIndexOf("-"))); // all restarting tests end with -1.txt or -2.txt
                }
                uniqueFiles = uniqueFileSet.ToArray();
+               Array.Sort(uniqueFiles);
                testFile = random.Choice(uniqueFiles);
                // The on-disk format changed in 4.0.0, and 5.x can't load files from 3.x.
                string oldBinaryVersionLowerBound = "4.0.0";
@@ -334,8 +335,9 @@ namespace SummarizeTest
                    // thus, by definition, if "until_" appears, we do not want to run with the current binary version
                    oldBinaries = oldBinaries.Concat(currentBinary);
                }
-               List<string> oldBinariesList = oldBinaries.ToList<string>();
-               if (oldBinariesList.Count == 0) {
+               string[] oldBinariesList = oldBinaries.ToArray<string>();
+               Array.Sort(oldBinariesList);
+               if (oldBinariesList.Count() == 0) {
                    // In theory, restarting tests are named to have at least one old binary version to run
                    // But if none of the provided old binaries fall in the range, we just skip the test
                    Console.WriteLine("No available old binary version from {0} to {1}", oldBinaryVersionLowerBound, oldBinaryVersionUpperBound);
@@ -347,6 +349,7 @@ namespace SummarizeTest
                else
                {
                    uniqueFiles = Directory.GetFiles(testDir);
+                   Array.Sort(uniqueFiles);
                    testFile = random.Choice(uniqueFiles);
                }
            }
@@ -718,7 +721,7 @@ namespace SummarizeTest
                process.Refresh();
                if (process.HasExited)
                    return;
-               long mem = process.PrivateMemorySize64;
+               long mem = process.PagedMemorySize64;
                MaxMem = Math.Max(MaxMem, mem);
                //Console.WriteLine(string.Format("Process used {0} bytes", MaxMem));
                Thread.Sleep(1000);
@ -0,0 +1,431 @@
|
|||
<form theme="light">
|
||||
<label>FoundationDB - Details</label>
|
||||
<description>Details for FoundationDB Cluster</description>
|
||||
<fieldset submitButton="false">
|
||||
<input type="text" token="Index" searchWhenChanged="true">
|
||||
<label>Index</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="text" token="LogGroup" searchWhenChanged="true">
|
||||
<label>LogGroup</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="time" token="TimeRange" searchWhenChanged="true">
|
||||
<label>Time Range</label>
|
||||
<default>
|
||||
<earliest>-60m@m</earliest>
|
||||
<latest>now</latest>
|
||||
</default>
|
||||
</input>
|
||||
<input type="dropdown" token="Span" searchWhenChanged="true">
|
||||
<label>Timechart Resolution</label>
|
||||
<choice value="bins=100">Default</choice>
|
||||
<choice value="span=5s">5 seconds</choice>
|
||||
<choice value="span=1m">1 minute</choice>
|
||||
<choice value="span=10m">10 minutes</choice>
|
||||
<choice value="span=1h">1 hour</choice>
|
||||
<choice value="span=1d">1 day</choice>
|
||||
<default>bins=100</default>
|
||||
<initialValue>bins=100</initialValue>
|
||||
</input>
|
||||
<input type="dropdown" token="Roles" searchWhenChanged="true">
|
||||
<label>Roles</label>
|
||||
<choice value="">All</choice>
|
||||
<choice value="Roles=*SS*">Storage Server</choice>
|
||||
<choice value="Roles=*TL*">Transaction Log</choice>
|
||||
<choice value="Roles=*MP*">Proxy</choice>
|
||||
<choice value="Roles=*RV*">Resolver</choice>
|
||||
<choice value="Roles=*MS*">Master</choice>
|
||||
<choice value="Roles=*CC*">Cluster Controller</choice>
|
||||
<choice value="Roles=*LR*">Log Router</choice>
|
||||
<choice value="Roles=*DD*">Data Distributor</choice>
|
||||
<choice value="Roles=*RK*">Ratekeeper</choice>
|
||||
<choice value="Roles=*TS*">Tester</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="text" token="Host" searchWhenChanged="true">
|
||||
<label>Host</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="text" token="Machine" searchWhenChanged="true">
|
||||
<label>Machine</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
</fieldset>
|
||||
<row>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Storage Queue Size</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | rex field=BytesDurable "(?<DurableRate>.*) (?<DurableRoughness>.*) (?<DurableCounter>.*)" | eval QueueSize=InputCounter-DurableCounter | timechart $Span$ avg(QueueSize) by Machine</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Storage Input Rate</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | timechart $Span$ avg(InputRate) by Machine</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Storage Bytes Queried</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesQueried "(?<Rate>.*) (?<Roughness>.*) (?<Counter>.*)" | timechart $Span$ avg(Rate) by Machine</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Average Process CPU by Role (capped at 2; beware kernel bug)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ avg(Cpu) by Roles</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.maximumNumber">2</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Max Process CPU by Role (capped at 2; beware kernel bug)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ max(Cpu) by Roles</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.maximumNumber">2</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Disk Busyness</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=ProcessMetrics TrackLatestType=Original | eval DiskBusyPercentage=(Elapsed-DiskIdleSeconds)/Elapsed | timechart $Span$ avg(DiskBusyPercentage) by Machine</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Max Run Loop Busyness by Role (for <=6.1, S2Pri1)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics NOT TrackLatestType=Rolled | eval Busyness=if(isnull(PriorityStarvedBelow1), if(isnull(PriorityBusy1), S2Pri1, PriorityBusy1/Elapsed), PriorityStarvedBelow1/Elapsed) | timechart $Span$ max(Busyness) by Roles</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Max Run Loop Busyness by Priority (6.2+ only)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics TrackLatestType=Original | foreach PriorityBusy* [eval Busyness<<MATCHSTR>>=PriorityBusy<<MATCHSTR>>/Elapsed] | timechart $Span$ max(Busyness*)</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>TLog Queue Size</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval QueueSize=SharedBytesInput-SharedBytesDurable | timechart $Span$ avg(QueueSize) by Machine</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Connection Timeouts (counted on both sides of connection)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) $Roles$ host=$Host$ | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) | rex field=WithAddr "(?<OtherAddr>[^:]*:[^:]*).*" | eval Machine=Machine+","+OtherAddr | makemv delim="," Machine | search Machine=$Machine$ | eval Count=1+SuppressedEventCount | timechart sum(Count) by Machine useother=f</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.chart.nullValueMode">zero</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Pairwise Connection Timeouts Between Datacenters</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) host=* Machine=* NOT TrackLatestType=Rolled
|
||||
| eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr)
|
||||
| rex field=host "(?<Datacenter>..).*"
|
||||
| eval Datacenter=if(isnotnull(pie_work_unit), pie_work_unit, Datacenter)
|
||||
| rex field=WithAddr "(?<OtherIP>[^:]*):.*"
|
||||
| join OtherIP
|
||||
[search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled
|
||||
| rex field=Machine "(?<OtherIP>[^:]*):.*"
|
||||
| rex field=host "(?<OtherDatacenter>..).*"
|
||||
| eval OtherDatacenter=if(isnotnull(pie_work_unit), pie_work_unit, OtherDatacenter)]
|
||||
| eval DC1=if(Datacenter>OtherDatacenter, Datacenter, OtherDatacenter), DC2=if(Datacenter>OtherDatacenter, OtherDatacenter, Datacenter)
|
||||
| eval Connection=DC1+" <-> " + DC2
|
||||
| eval Count=1+SuppressedEventCount
|
||||
| timechart sum(Count) by Connection</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<table>
|
||||
<title>Pairwise Connection Timeouts Between Known Server Processes (Sorted by Count, Descending)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut OR Type=ProcessMetrics) $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr), Reason=if(Type=="ConnectionTimedOut", "Timed out trying to connect", "Established connection timed out") | rex field=Machine "(?<IP>[^:]*):.*" | rex field=host "(?<Datacenter>..).*" | rex field=WithAddr "(?<OtherIP>[^:]*):.*" | eventstats values(Roles) as Roles by IP | join OtherIP [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled | rex field=Machine "(?<OtherIP>[^:]*):.*" | rex field=host "(?<OtherDatacenter>..).*" | stats values(Roles) as OtherRoles by OtherIP, OtherDatacenter | eval OtherRoles="("+mvjoin(OtherRoles,",")+")"] | eval Roles="("+mvjoin(Roles,",")+")" | eval IP=Datacenter+": "+IP+" "+Roles, OtherIP=OtherDatacenter+": "+OtherIP+" "+OtherRoles | eval Addr1=if(IP>OtherIP, IP, OtherIP), Addr2=if(IP>OtherIP, OtherIP, IP) | eval Connection=Addr1+" <-> " + Addr2 | eval Count=1+SuppressedEventCount | stats sum(Count) as Count, values(Reason) as Reasons by Connection | sort -Count</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Lazy Deletion Rate (making space available for reuse)</title>
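<!-- Descriptive note: streamstats pairs each SpringCleaningMetrics sample with the next one from the same process (ID), so Rate = 4096 * (pages deleted between samples) / (seconds between samples), i.e. bytes/sec assuming the 4 KiB page size implied by the 4096 multiplier. -->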
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=LazyDeletePages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Vacuuming Rate (shrinking file)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=VacuumedPages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Roles</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | makemv delim="," Roles | mvexpand Roles | timechart $Span$ distinct_count(Machine) by Roles</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<table>
|
||||
<title>Slow Tasks (Sorted by Duration, Descending)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=SlowTask $Roles$ host=$Host$ Machine=$Machine$ | sort -Duration | table _time, Duration, Machine, TaskID, Roles</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<table>
|
||||
<title>Event Counts (Sorted by Severity and Count, Descending)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | stats count as Count by Type, Severity | sort -Severity, -Count</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<table>
|
||||
<title>Errors</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Severity=40 $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | table _time, Type, Machine, Roles</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<table>
|
||||
<title>Recoveries (Ignores Filters)</title>
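<!-- Descriptive note (editor's reading): this appears to group MasterRecoveryState events into recovery episodes. StatusCode=0 marks an attempt starting and StatusCode=11 a completion; attempts are folded together until a completion, and completions less than RecoveryResetInterval (10s) apart are merged into one episode whose extra completions are reported as ShortLivedRecoveryCount. -->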
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=MasterRecoveryState TrackLatestType=Original (StatusCode=0 OR StatusCode=11) | eval RecoveryResetInterval=10 | sort _time | streamstats earliest(_time) as RecoveryStart, count as EventCount reset_after="(StatusCode=11)" | where StatusCode=11 | eval EventCount=if(EventCount==1, 2, EventCount), RecoveryStart=if(RecoveryStart==_time, _time-RecoveryDuration, RecoveryStart) | sort -_time | streamstats current=f global=f window=1 first(RecoveryStart) as NextRecoveryStart | eval RecoverySpan=NextRecoveryStart-_time, FailedRecoveries=EventCount-2, SuccessfulRecoveries=1 | eval AvailableSeconds=if(RecoverySpan<RecoveryResetInterval, RecoverySpan, 0) | sort _time | streamstats earliest(RecoveryStart) as RecoveryStart, sum(FailedRecoveries) as FailedRecoveryCount, sum(SuccessfulRecoveries) as SuccessfulRecoveryCount, sum(AvailableSeconds) as AvailableSeconds reset_after="(NOT RecoverySpan < RecoveryResetInterval)" | where NOT RecoverySpan < RecoveryResetInterval | eval Duration=_time-RecoveryStart, StartTime=strftime(RecoveryStart, "%F %X.%Q"), ShortLivedRecoveryCount=SuccessfulRecoveryCount-1 | table StartTime, Duration, FailedRecoveryCount, ShortLivedRecoveryCount, AvailableSeconds | sort -StartTime</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<table>
|
||||
<title>Process (Re)starts</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProgramStart TrackLatestType=Original $Roles$ host=$Host$ Machine=$Machine$ | table _time, Machine | sort -_time</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Failure Detection (Machine Filter Only)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=FailureDetectionStatus System=$Machine$ | sort _time | eval Failed=if(Status=="Failed", 1, 0) | streamstats current=t global=f window=2 first(Failed) as PrevFailed by System | where PrevFailed=1 OR Failed=1 | eval Failed=PrevFailed + "," + Failed | makemv delim="," Failed | mvexpand Failed | timechart $Span$ max(Failed) by System</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.maximumNumber">1</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<table>
|
||||
<title>Storage Server Space Usage (Sorted by Available Space Percentage, Ascending)</title>
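<!-- Descriptive note: OverheadFactor = KvstoreBytesUsed / BytesStored compares on-disk usage with the logical KV size; AvailableSpacePercent and FreeSpacePercent are both fractions of KvstoreBytesTotal. -->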
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBStored=BytesStored/1e9, Overhead=KvstoreBytesUsed/BytesStored, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeSpacePercent) as FreeSpacePercent, latest(GBStored) as GBStored, latest(GBUsed) as GBUsed, latest(Overhead) as OverheadFactor, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<table>
|
||||
<title>TLog Server Space Usage (Sorted by Available Space Percentage, Ascending)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics host=* Machine=* TrackLatestType=Original Roles=TL | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeDiskSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeDiskSpacePercent) as FreeDiskSpacePercent, latest(GBUsed) as GBUsed, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Data Movement by Type (Log Scale, Ignores Filters)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=MovingData TrackLatestType=Original | timechart avg(Priority*) as *</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<chart>
|
||||
<title>Storage Server Max Bytes Stored by Host</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval GBStored=BytesStored/1e9 | timechart max(GBStored) by host limit=100</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<table>
|
||||
<title>Master Failed Clients</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=WaitFailureClient
|
||||
| stats count by FailedEndpoint</query>
|
||||
<earliest>$TimeRange.earliest$</earliest>
|
||||
<latest>$TimeRange.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
</form>
|
|
@ -0,0 +1,323 @@
|
|||
<form theme="dark">
|
||||
<label>FoundationDB - Performance Overview (Dev WiP)</label>
|
||||
<fieldset submitButton="false" autoRun="true">
|
||||
<input type="text" token="Index" searchWhenChanged="true">
|
||||
<label>Index</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="text" token="LogGroup" searchWhenChanged="true">
|
||||
<label>LogGroup</label>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="time" token="TimeSpan" searchWhenChanged="true">
|
||||
<label>TimeSpan</label>
|
||||
<default>
|
||||
<earliest>-60m@m</earliest>
|
||||
<latest>now</latest>
|
||||
</default>
|
||||
</input>
|
||||
<input type="dropdown" token="UpdateRateTypeToken" searchWhenChanged="true">
|
||||
<label>RK: Normal or Batch Txn</label>
|
||||
<choice value="">Normal</choice>
|
||||
<choice value="Batch">Batch</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="text" token="ChartBinSizeToken" searchWhenChanged="true">
|
||||
<label>Chart Bin Size</label>
|
||||
<default>60s</default>
|
||||
</input>
|
||||
</fieldset>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Transaction Rate measured on Proxies</title>
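<!-- Descriptive note: the Txn* counters arrive as a single space-separated "rate roughness counter" triple; makemv splits the triple and mvindex(..., 0) selects the per-second rate component. -->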
|
||||
<chart>
|
||||
<title>Sum per $ChartBinSizeToken$ bin</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
|
||||
| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " TxnThrottled
|
||||
| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), TxnThrottledRate=mvindex(TxnThrottled, 0)
|
||||
| timechart span=$ChartBinSizeToken$ sum(TxnRequestInRate) as StartedTxnBatchRate, sum(TxnRequestOutRate) as FinishedTxnBatchRate, sum(TxnStartInRate) as StartedTxnRate, sum(TxnStartOutRate) as FinishedTxnRate, sum(TxnThrottledRate) as ThrottledTxnRate</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Read Rate measured on Storage Servers</title>
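<!-- Descriptive note: same triple convention as the proxy counters, extracted here with rex instead of makemv; the first capture of each "(rate) (roughness) (counter)" field is the per-second rate. -->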
|
||||
<chart>
|
||||
<title>Average per $ChartBinSizeToken$ bin</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original"
|
||||
| rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)"
|
||||
| rex field=RowsQueried "(?<KRate>.*) (?<KRoughness>.*) (?<KCounter>.*)"
|
||||
| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)"
|
||||
| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)"
|
||||
| timechart span=$ChartBinSizeToken$ avg(RRate) as BytesReadPerSecond, avg(KRate) as RowsReadPerSecond, avg(FRate) as DDReadPerSecond</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Write Rate measured on Proxies</title>
|
||||
<chart>
|
||||
<title>1min Average</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
|
||||
| makemv delim=" " MutationBytes
|
||||
| makemv delim=" " Mutations
|
||||
| eval MutationBytesRate=mvindex(MutationBytes, 0), MutationsRate=mvindex(Mutations,0)
|
||||
| bucket span=5s _time
|
||||
| stats sum(MutationBytesRate) as MutationBytes, sum(MutationsRate) as Mutations by _time
|
||||
| eval MutationMB=MutationBytes/1024/1024, MutationsK=Mutations/1000
|
||||
| timechart span=$ChartBinSizeToken$ avg(MutationMB) as MutationMB, avg(MutationsK) as MutationsK</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.abbreviation">none</option>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="charting.layout.splitSeries">0</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Write Rate measured on Storage Servers</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original"
|
||||
| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)"
|
||||
| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)"
|
||||
| timechart span=$ChartBinSizeToken$ avg(WRate) as BytesPerSecond, avg(FRate) as DDBytesWrittenPerSecond</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>GRV Latency measured on all Proxies</title>
|
||||
<chart>
|
||||
<title>Seconds</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=GRVLatencyMetrics AND TrackLatestType="Original"
|
||||
| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="charting.legend.placement">bottom</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Commit Latency measured on all Proxies</title>
|
||||
<chart>
|
||||
<title>Seconds</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=CommitLatencyMetrics AND TrackLatestType="Original"
|
||||
| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="charting.legend.placement">bottom</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Read Latency measured on all Storage Servers</title>
|
||||
<chart>
|
||||
<title>Seconds</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ReadLatencyMetrics AND TrackLatestType="Original"
|
||||
| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="charting.legend.placement">bottom</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>RateKeeper: ReleasedTPS vs LimitTPS</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
|
||||
| replace inf with 100000000000
|
||||
| eval _time=Time
|
||||
| table _time ReleasedTPS TPSLimit
|
||||
| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="height">251</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>RateKeeper: Throttling Reason</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
|
||||
| replace inf with 100000000000
|
||||
| eval _time=Time
|
||||
| table _time Reason</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisLabelsY.majorUnit">1</option>
|
||||
<option name="charting.axisY.abbreviation">none</option>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">area</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="charting.legend.mode">standard</option>
|
||||
<option name="height">249</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>RateKeeper: Throttling Server</title>
|
||||
<table>
|
||||
<title>Ratekeeper: Limit Reason: ReasonServerID (Most recent 10 records)</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate AND TrackLatestType="Original"
|
||||
| streamstats count as numOfEvents
|
||||
| where numOfEvents <= 10
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%dT%H:%M:%S")
|
||||
| table DateTime, ReasonServerID</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Disk Overhead = Disk Usage / Logical KV Size</title>
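<!-- Descriptive note: overhead = sum of KvstoreBytesUsed across storage servers divided by the logical KV size reported by data distribution (avg TotalSizeBytes from DDTrackerStats); events are bucketed to 5s so the two event types align. -->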
|
||||
<chart>
|
||||
<title>Y-axis is capped at 10</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type=StorageMetrics OR Type=DDTrackerStats) TrackLatestType=Original
|
||||
| bucket _time span=5s
|
||||
| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes, avg(TotalSizeBytes) as LogicalKVBytes by _time
|
||||
| eval overhead=StorageDiskUsedBytes/LogicalKVBytes
|
||||
| timechart avg(overhead)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.maximumNumber">10</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="charting.legend.placement">bottom</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>KV Data Size</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Roles=*DD* host=* Machine=* Type=DDTrackerStats TrackLatestType=Original
|
||||
| eval TotalKVGB=TotalSizeBytes/1024/1024/1024, SystemKVGB=SystemSizeBytes/1024/1024/1024
|
||||
| timechart avg(TotalKVGB), avg(SystemKVGB), avg(Shards)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="charting.legend.placement">bottom</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Disk Usage</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ host=* Machine=* Type=StorageMetrics TrackLatestType=Original
|
||||
| bucket _time span=5s
|
||||
| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes by _time
|
||||
| eval StorageDiskTotalMB=StorageDiskTotalBytes/1024/1024, StorageDiskUsedMB=StorageDiskUsedBytes/1024/1024
|
||||
| timechart avg(StorageDiskTotalMB) as StorageDiskTotalMB, avg(StorageDiskUsedMB) as StorageDiskUsedMB</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="charting.legend.placement">bottom</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Cluster Roles</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics TrackLatestType="Original"
|
||||
| rex field=host "(?<HostDC>..).*-..(?<HostConfig>..).*"
|
||||
| eval HostDC=if(isnotnull(pie_work_unit), pie_work_unit, HostDC)
|
||||
| makemv delim="," Roles
|
||||
| stats dc(Machine) as MachineCount by Roles, HostDC
|
||||
| stats list(HostDC), list(MachineCount) by Roles
|
||||
| sort Roles</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Storage Engine</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=Role Origination=Recruited As=StorageServer | table StorageEngine, OriginalDateTime, DateTime | head 2</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Cluster Generations</title>
|
||||
<chart>
|
||||
<title>Indicates FDB recoveries</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics | timechart max(Generation)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
</form>
|
|
@ -0,0 +1,928 @@
|
|||
<form theme="dark">
|
||||
<label>FoundationDB - RateKeeper (Dev)</label>
|
||||
<fieldset submitButton="false">
|
||||
<input type="text" token="Index" searchWhenChanged="true">
|
||||
<label>Index</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="text" token="LogGroup" searchWhenChanged="true">
|
||||
<label>LogGroup</label>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="time" token="TimeSpan" searchWhenChanged="true">
|
||||
<label>TimeSpan</label>
|
||||
<default>
|
||||
<earliest>-60m@m</earliest>
|
||||
<latest>now</latest>
|
||||
</default>
|
||||
</input>
|
||||
<input type="dropdown" token="UpdateRateTypeToken" searchWhenChanged="true">
|
||||
<label>RKChart: Normal or Batch</label>
|
||||
<choice value="">Normal</choice>
|
||||
<choice value="Batch">Batch</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="text" token="ChartBinSizeToken" searchWhenChanged="true">
|
||||
<label>Chart Bin Size</label>
|
||||
<default>30s</default>
|
||||
</input>
|
||||
<input type="dropdown" token="ChartByMachineToken" searchWhenChanged="true">
|
||||
<label>ClusterStateMetric byMachine</label>
|
||||
<choice value="by Machine">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="dropdown" token="RolePerformanceChartToken" searchWhenChanged="true">
|
||||
<label>Role for Proc Perf Charts</label>
|
||||
<choice value="MasterServer">MasterServer</choice>
|
||||
<choice value="MasterProxyServer">MasterProxyServer</choice>
|
||||
<choice value="StorageServer">StorageServer</choice>
|
||||
<choice value="TLog">TLog</choice>
|
||||
<choice value="Resolver">Resolver</choice>
|
||||
<choice value="GrvProxyServer">GrvProxyServer</choice>
|
||||
<choice value="CommitProxyServer">CommitProxyServer</choice>
|
||||
</input>
|
||||
<input type="dropdown" token="SourcePerfConnectionToken" searchWhenChanged="true">
|
||||
<label>Source for Perf Connection</label>
|
||||
<choice value="MasterServer">MasterServer</choice>
|
||||
<choice value="MasterProxyServer">MasterProxyServer</choice>
|
||||
<choice value="Resolver">Resolver</choice>
|
||||
<choice value="TLog">TLog</choice>
|
||||
<choice value="StorageServer">StorageServer</choice>
|
||||
<choice value="GrvProxyServer">GrvProxyServer</choice>
|
||||
<choice value="CommitProxyServer">CommitProxyServer</choice>
|
||||
</input>
|
||||
<input type="dropdown" token="DestinationPerfConnectionToken" searchWhenChanged="true">
|
||||
<label>Dest for Perf Connection</label>
|
||||
<choice value="MasterServer">MasterServer</choice>
|
||||
<choice value="MasterProxyServer">MasterProxyServer</choice>
|
||||
<choice value="Resolver">Resolver</choice>
|
||||
<choice value="TLog">TLog</choice>
|
||||
<choice value="StorageServer">StorageServer</choice>
|
||||
<choice value="GrvProxyServer">GrvProxyServer</choice>
|
||||
<choice value="CommitProxyServer">CommitProxyServer</choice>
|
||||
</input>
|
||||
</fieldset>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Aggregated Storage Server Bandwidth</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original"
|
||||
| rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)"
|
||||
| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)"
|
||||
| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)"
|
||||
| bin span=5s _time
|
||||
| stats sum(RRate) as ReadSum, sum(WRate) as WriteSum, sum(FRate) as FetchedKeyRate by _time
|
||||
| eval ReadSpeedMB=ReadSum/1024/1024, WriteSpeedMB=WriteSum/1024/1024, FetchedKeyRateMB=FetchedKeyRate/1024/1024
|
||||
| timechart avg(ReadSpeedMB), avg(WriteSpeedMB), avg(FetchedKeyRateMB)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Aggregated Proxy Bandwidth</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
|
||||
| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " MutationBytes
|
||||
| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), MutationBytesRate=mvindex(MutationBytes, 0)
|
||||
| bin span=60s _time
|
||||
| stats avg(TxnRequestInRate) as TxnRequestInRatePerHost, avg(TxnRequestOutRate) as TxnRequestOutRatePerHost, avg(TxnStartInRate) as TxnStartInRatePerHost, avg(TxnStartOutRate) as TxnStartOutRatePerHost, avg(MutationBytesRate) as MutationBytesRatePerHost by Machine,_time
|
||||
| eval WriteThroughputKB=MutationBytesRatePerHost/1000
|
||||
| timechart span=1m sum(TxnRequestInRatePerHost), sum(TxnRequestOutRatePerHost), sum(TxnStartInRatePerHost), sum(TxnStartOutRatePerHost), sum(WriteThroughputKB)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 1: Overview - GRV Arrivals and Leaves per Second Seen by Proxies</title>
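<!-- Descriptive note: the counter fields are "rate roughness counter" triples, so they are split with makemv before mvindex(..., 0) extracts the per-second rate (the same idiom as the proxy bandwidth panel above). -->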
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
|
||||
| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut
| eval TxnRequestIn=mvindex(TxnRequestIn, 0), TxnRequestOut=mvindex(TxnRequestOut, 0), TxnStartIn=mvindex(TxnStartIn, 0), TxnStartOut=mvindex(TxnStartOut, 0)
|
||||
| timechart span=30s avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) by Machine</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="height">249</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 2: RKOverview - Input ReleasedTPS and Output TPSLimit</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
|
||||
| replace inf with 100000000000
|
||||
| eval _time=Time
|
||||
| table _time ReleasedTPS TPSLimit
|
||||
| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="height">251</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 3: RKOverview - RKLimitReason</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
|
||||
| replace inf with 100000000000
|
||||
| eval _time=Time
|
||||
| table _time Reason</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisLabelsY.majorUnit">1</option>
|
||||
<option name="charting.axisY.abbreviation">none</option>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">area</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="height">249</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 4: Don't Process Transactions - RkSSListFetchTimeout (TpsLimit = 0)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="RkSSListFetchTimeout"
|
||||
| timechart span=1s count</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 5: Don't Process Transactions - RkTlogMinFreeSpaceZero (TpsLimit = 0)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="RkTlogMinFreeSpaceZero"
|
||||
| timechart span=1s count</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 6: Don't Process Transactions - ProxyGRVThresholdExceeded</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyGRVThresholdExceeded*") AND TrackLatestType="Original"
|
||||
| timechart span=1s count by Type</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 7: RKLimitReasonCandidate - LimitingStorageServerDurabilityLag (MVCCVersionInMemory)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
|
||||
| replace inf with 100000000000
|
||||
| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerDurabilityLag)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 8: RKLimitReasonCandidate - LimitingStorageServerVersionLag (TLogVer-SSVer)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
|
||||
| replace inf with 100000000000
|
||||
| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerVersionLag)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 9: RKLimitReasonCandidate - LimitingStorageServerQueue</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
|
||||
| replace inf with 100000000000
|
||||
| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerQueue)</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 10: Runtime Monitoring - StorageServer MVCCVersionInMemory (storage_server_durability_lag)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
|
||||
| eval NonDurableVersions=Version-DurableVersion
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(NonDurableVersions) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="height">251</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 11: Runtime Monitoring - StorageServer LocalRate (higher MVCCVersionInMemory -> lower LocalRate)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics"
|
||||
| timechart limit=0 avg(LocalRate) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 12: Runtime Monitoring - StorageServer ReadsRejected (lower LocalRate -> higher probability of rejecting reads)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics"
|
||||
| timechart limit=0 avg(ReadsRejected) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 13: Runtime Monitoring - Version Lag between StorageServer and Tlog (storage_server_readable_behind)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
|
||||
| eval SSFallBehindVersions=VersionLag
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(SSFallBehindVersions) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 14: Runtime Monitoring - StorageServerBytes (storage_server_write_queue_size)</title>
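<!-- Descriptive note: here mvindex(..., 2) selects the third element of the counter triple, the cumulative byte count, so BytesInMemoryQueue = total bytes input minus total bytes durable per storage server. -->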
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
|
||||
| makemv delim=" " BytesInput | makemv delim=" " BytesDurable | makemv delim=" " BytesFetched | makemv delim=" " MutationBytes
|
||||
| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesFetched=mvindex(BytesFetched, 2), MutationBytes=mvindex(MutationBytes, 2), BytesInMemoryQueue=BytesInput-BytesDurable
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 15: Runtime Monitoring - StorageServer KVStore Free Space Ratio (storage_server_min_free_space)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
|
||||
| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 16: Runtime Monitoring - TLog Queue Free Space Ratio (log_server_min_free_space)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original"
|
||||
| eval QueueBytesFreeRatio=QueueDiskBytesFree/QueueDiskBytesTotal
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(QueueBytesFreeRatio) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 17: Runtime Monitoring - TLog KVStore Free Space Ratio (log_server_min_free_space)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original"
|
||||
| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 18: Runtime Monitoring - TLogBytes (log_server_write_queue)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original"
|
||||
| makemv delim=" " BytesInput
|
||||
| makemv delim=" " BytesDurable
|
||||
| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesInMemoryQueue=BytesInput-BytesDurable | timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 19: Runtime Monitoring - Proxy Throughput</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) avg(TxnStartBatch) avg(TxnStartErrors) avg(TxnCommitIn) avg(TxnCommitVersionAssigned) avg(TxnCommitResolving) avg(TxnCommitResolved) avg(TxnCommitOut) avg(TxnCommitOutSuccess) avg(TxnCommitErrors) avg(TxnThrottled) avg(TxnConflicts) avg(CommitBatchIn) avg(CommitBatchOut) avg(TxnRejectedForQueuedTooLong) avg(Mutations) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 20: Runtime Monitoring - Proxy Queue Length</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" | timechart span=$ChartBinSizeToken$ limit=0 avg(*QueueSize*) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 21: Runtime Monitoring - TLog UnpoppedVersion</title>
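<!-- Descriptive note: UnpoppedVersion = PersistentDataDurableVersion - QueuePoppedVersion, i.e. the span of versions the TLog must still retain because they have not yet been popped. -->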
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original"
|
||||
| eval UnpoppedVersion=PersistentDataDurableVersion-QueuePoppedVersion
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(UnpoppedVersion) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 22: Runtime Monitoring - Storage Server Disk (AIODiskStall)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="ProcessMetrics"
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As="StorageServer"
|
||||
| stats first(Machine) by Machine
|
||||
| rename first(Machine) as Machine
|
||||
| table Machine]
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(AIODiskStall) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 23: Runtime Monitoring - StorageServer Query Queue Length</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
|
||||
| makemv QueryQueue | eval QueryQueue=mvindex(QueryQueue, 1) | table _time QueryQueue Machine
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(QueryQueue) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 24: Transaction Trace Stats - GRV Latency (shows CC transactions only by default; client transactions appear only when client transaction tracing is manually enabled)</title>
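<!-- Descriptive note (editor's reading): reconstructs per-transaction GRV spans by joining proxy-side TransactionDebug events to their request events via TransactionAttachID (parent ID to child ID), collapsing all events per ID with stats list(*), then histogramming TimeSpan = last event time minus first event time in $StatsGRVSpanToken$ buckets. -->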
|
||||
<input type="dropdown" token="GRVByMachineStatsToken" searchWhenChanged="true">
|
||||
<label>By Machine</label>
|
||||
<choice value="Machine">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="text" token="StatsGRVSpanToken" searchWhenChanged="true">
|
||||
<label>Span</label>
|
||||
<default>500ms</default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="TransactionDebug" AND (*ProxyServer.masterProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After)
|
||||
| table Time Type ID Location Machine Roles
|
||||
| append
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before)
|
||||
| rename ID as ParentID
|
||||
| table Time Type ParentID Location Machine Roles
|
||||
| join ParentID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID"
|
||||
| rename ID as ParentID
|
||||
| rename To as ID
|
||||
| table ParentID ID]
|
||||
| table Time Type ID Location Machine Roles]
|
||||
| table Time Type ID Location Machine Roles
|
||||
| sort 0 Time
|
||||
| table Machine Location Time Roles Type ID
|
||||
| stats list(*) by ID
|
||||
| rename list(*) as *
|
||||
| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin
|
||||
| bin bins=20 span=$StatsGRVSpanToken$ TimeSpan
|
||||
| chart limit=0 count by TimeSpan $GRVByMachineStatsToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">column</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 25: Transaction Trace Stats - GetValue Latency (shows CC transactions only by default; client transactions appear only when client transaction tracing is manually enabled)</title>
|
||||
<input type="dropdown" token="GetValueByMachineStatsToken" searchWhenChanged="true">
|
||||
<label>By Machine</label>
|
||||
<choice value="Machine">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="text" token="StatsReadSpanToken" searchWhenChanged="true">
|
||||
<label>Span</label>
|
||||
<default>500ms</default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After)
|
||||
| table Machine Location Time Roles ID Type
|
||||
| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10)
|
||||
| sort 0 Time Order
|
||||
| stats list(*) by ID
|
||||
| rename list(*) as *
|
||||
| table Machine Location Time Roles ID Type
|
||||
| eval count = mvcount(Location)
|
||||
| search count>2
|
||||
| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin
|
||||
| table _time ID TimeSpan Machine Location Time
|
||||
| bin bins=20 span=$StatsReadSpanToken$ TimeSpan
|
||||
| chart limit=0 count by TimeSpan $GetValueByMachineStatsToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">column</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
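<!--
  How the GetValue latency histogram in Chart 25 is computed (a sketch; it assumes all read
  trace events for one request share the same debug ID): events are sorted by Time plus a
  per-Location Order, collapsed into one multivalue row per ID, and the span is the gap
  between the last and first timestamps:

    | sort 0 Time Order
    | stats list(*) by ID
    | rename list(*) as *
    | eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin

  The search count>2 step drops IDs with too few events to form a meaningful span.
-->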
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 26: Transaction Trace Stats - Commit Latency (shows only CC transactions by default; client transactions appear only when client transaction tracing is manually enabled)</title>
|
||||
<input type="dropdown" token="CommitByMachineStatsToken">
|
||||
<label>By Machine</label>
|
||||
<choice value="Machine">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default>Machine</default>
|
||||
</input>
|
||||
<input type="text" token="StatsCommitSpanToken" searchWhenChanged="true">
|
||||
<label>Span</label>
|
||||
<default>500ms</default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution)
|
||||
| table Time Type ID Location Machine Roles
|
||||
| sort 0 Time
|
||||
| table Machine Location Time Roles Type ID
|
||||
| stats list(*) by ID
|
||||
| rename list(*) as *
|
||||
| eval Count=mvcount(Location)
|
||||
| search Count>=2
|
||||
| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin
|
||||
| table _time TimeSpan Machine
|
||||
| bin bins=20 span=$StatsCommitSpanToken$ TimeSpan
|
||||
| chart limit=0 count by TimeSpan $CommitByMachineStatsToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">column</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
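<!--
  Chart 26 turns the per-commit spans into a histogram: bin buckets TimeSpan (the Span input
  sets the bucket width, bins=20 caps the bucket count) and chart counts commits per bucket,
  optionally split by Machine. A sketch, assuming TimeSpan is already computed per commit:

    | bin bins=20 span=500ms TimeSpan
    | chart limit=0 count by TimeSpan Machine

  With the default 500ms span, a commit taking 0.7s lands in the 0.5 to 1.0s bucket.
-->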
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 27: Transaction Tracing - GRV Latency (shows only CC transactions by default; client transactions appear only when client transaction tracing is manually enabled)</title>
|
||||
<input type="dropdown" token="GRVLatencyByMachineToken" searchWhenChanged="true">
|
||||
<label>By Machine</label>
|
||||
<choice value="by Machine">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="TransactionDebug" AND (*ProxyServer.*ProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After)
|
||||
| table Time Type ID Location Machine Roles
|
||||
| append
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before)
|
||||
| rename ID as ParentID
|
||||
| table Time Type ParentID Location Machine Roles
|
||||
| join ParentID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID"
|
||||
| rename ID as ParentID
|
||||
| rename To as ID
|
||||
| table ParentID ID]
|
||||
| table Time Type ID Location Machine Roles]
|
||||
| table Time Type ID Location Machine Roles
|
||||
| eval Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6)
|
||||
| table Time Order Type ID Location Machine Roles
|
||||
| sort 0 Order Time
|
||||
| table Machine Location Time Roles Type ID
|
||||
| stats list(*) by ID
|
||||
| rename list(*) as *
|
||||
| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), TimeInQueue = T2-T1, TimeGetVersionFromProxies = if(mvcount(Time)==4, T3-T2, -0.0000001), TimeConfirmLivenessFromTLogs = if(mvcount(Time)==4, T4-T3, T3-T2), TimeSpan=if(mvcount(Time)==4,T4-T1,T3-T1), _time=T1
|
||||
| table _time TimeSpan TimeInQueue TimeGetVersionFromProxies TimeConfirmLivenessFromTLogs Machine
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeInQueue), avg(TimeGetVersionFromProxies), avg(TimeConfirmLivenessFromTLogs) $GRVLatencyByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
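<!--
  Chart 27 decomposes each GRV into phases from the first four timestamps of the stitched
  event list (a sketch; T1 to T4 are positional, so it relies on the Order sort producing
  queue-enter, broadcast, liveness-confirm, reply in that sequence):

    | eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3),
           TimeInQueue=T2-T1

  When only three events are present, TimeGetVersionFromProxies is reported as a tiny negative
  sentinel (-0.0000001) so the series stays plottable instead of going null.
-->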
<panel>
|
||||
<title>Chart 28: Transaction Tracing - GetValue Latency (shows only CC transactions by default; client transactions appear only when client transaction tracing is manually enabled)</title>
|
||||
<input type="dropdown" token="GetValueLatencyByMachineToken" searchWhenChanged="true">
|
||||
<label>By Machine</label>
|
||||
<choice value="by Machine">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After)
|
||||
| table Machine Location Time Roles ID Type
|
||||
| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10)
|
||||
| sort 0 Time Order
|
||||
| stats list(*) by ID
|
||||
| rename list(*) as *
|
||||
| table Machine Location Time Roles ID Type
|
||||
| eval count = mvcount(Location)
|
||||
| search count>2
|
||||
| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin
|
||||
| table _time TimeSpan
|
||||
| timechart span=30s limit=0 avg(TimeSpan) $GetValueLatencyByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 29: Transaction Tracing - Commit Latency (shows only CC transactions by default; client transactions appear only when client transaction tracing is manually enabled)</title>
|
||||
<input type="dropdown" token="CommitByMachineToken" searchWhenChanged="true">
|
||||
<label>By Machine</label>
|
||||
<choice value="By Machine">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution)
|
||||
| table Time Type ID Location Machine Roles
|
||||
| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16)
|
||||
| table Time Order Type ID Location Machine Roles
|
||||
| sort 0 Time Order
|
||||
| table Machine Location Time Roles Type ID
|
||||
| stats list(*) by ID
|
||||
| rename list(*) as *
|
||||
| eval Count=mvcount(Location)
|
||||
| search Count=7
|
||||
| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), T5=mvindex(Time, 4), T6=mvindex(Time, 5), T7=mvindex(Time, 6), TimeSpan=T7-T1, TimeResolution=T4-T3, TimePostResolution=T5-T4, TimeProcessingMutation=T6-T5, TimeTLogPush=T7-T6, _time=T1
|
||||
| table _time TimeSpan TimeResolution TimePostResolution TimeProcessingMutation TimeTLogPush Machine
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeResolution), avg(TimePostResolution), avg(TimeProcessingMutation), avg(TimeTLogPush) $CommitByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
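<!--
  Chart 29 keeps only commits that produced exactly seven trace events (search Count=7) so
  the positional timestamps line up, then derives each phase by subtracting neighbours. A
  sketch of the arithmetic, with T1 to T7 taken positionally from the sorted Time multivalue:

    TimeResolution         = T4 - T3
    TimePostResolution     = T5 - T4
    TimeProcessingMutation = T6 - T5
    TimeTLogPush           = T7 - T6
    TimeSpan               = T7 - T1   (the whole commit)
-->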
<panel>
|
||||
<title>Chart 30: Transaction Tracing - Commit - TLogPush and Resolver Latency (shows only CC transactions by default; client transactions appear only when client transaction tracing is manually enabled)</title>
|
||||
<input type="dropdown" token="TLogResolverByMachineToken" searchWhenChanged="true">
|
||||
<label>By Machine</label>
|
||||
<choice value="MachineStep">Yes</choice>
|
||||
<choice value="Step">No</choice>
|
||||
<default>Step</default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitDebug" AND (Resolver.resolveBatch.Before OR Resolver.resolveBatch.AfterQueueSizeCheck OR Resolver.resolveBatch.AfterOrderer OR Resolver.resolveBatch.After OR TLog.tLogCommit.BeforeWaitForVersion OR TLog.tLogCommit.Before OR TLog.tLogCommit.AfterTLogCommit OR TLog.tLogCommit.After)
|
||||
| table Time Type ID Location Machine Roles
|
||||
| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location=="MasterProxyServer.batcher", 1, Location=="MasterProxyServer.commitBatch.Before", 2, Location=="MasterProxyServer.commitBatch.GettingCommitVersion", 3, Location=="MasterProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location=="MasterProxyServer.commitBatch.AfterResolution", 8.5, Location=="MasterProxyServer.commitBatch.ProcessingMutations", 9, Location=="MasterProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location=="MasterProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16)
|
||||
| table Time Order Type ID Location Machine Roles
|
||||
| sort 0 Time Order
|
||||
| table Machine Location Time Roles Type ID
|
||||
| stats list(*) by ID
|
||||
| rename list(*) as *
|
||||
| eval Count=mvcount(Location), Step=case(Count=4 and (mvindex(Location, 0) like "TLog%"), "TimeTLogCommit", Count=4 and (mvindex(Location, 0) like "Resolver%"), "TimeResolver", Count=10, "TimeSpan"), BeginTime=mvindex(Time, 0), EndTime=mvindex(Time, -1), Duration=EndTime-BeginTime, _time=BeginTime
|
||||
| search Count=4
|
||||
| eval Machinei=mvindex(Machine, 0), MachineStep = Step."-".Machinei
|
||||
| table _time Step Duration Machinei Location Machine MachineStep
|
||||
| timechart span=$ChartBinSizeToken$ limit=0 avg(Duration) by $TLogResolverByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
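<!--
  Chart 30 separates TLog commits from resolver batches by the prefix of the first Location
  in each four-event group, then keys the timechart by either Step or Step."-".Machine. A
  sketch of the classification (Count is mvcount(Location)):

    | eval Step=case(Count=4 AND (mvindex(Location, 0) like "TLog%"), "TimeTLogCommit",
                     Count=4 AND (mvindex(Location, 0) like "Resolver%"), "TimeResolver")
    | search Count=4
-->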
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 31: Machine Performance - CPU Utilization (CPU Time divided by Elapsed)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
|
||||
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory Elapsed
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
|
||||
| stats first(Machine) by Machine
|
||||
| rename first(Machine) as Machine
|
||||
| table Machine]
|
||||
| eval Utilization=CPUSeconds/Elapsed
|
||||
| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
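<!--
  Charts 31 to 35 share one pattern: ProcessMetrics samples are restricted, via a join on
  Machine, to machines that ever hosted the role selected by $RolePerformanceChartToken$.
  The machine-list sub-search (a sketch; it assumes Role events carry a Machine field) is:

    index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
    | stats first(Machine) by Machine
    | rename first(Machine) as Machine
    | table Machine

  CPU utilization is then simply CPUSeconds/Elapsed, averaged per time bucket.
-->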
<panel>
|
||||
<title>Chart 32: Machine Performance - Memory Utilization (ResidentMemory divided by Memory)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
|
||||
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
|
||||
| stats first(Machine) by Machine
|
||||
| rename first(Machine) as Machine
|
||||
| table Machine]
|
||||
| eval Utilization = ResidentMemory/Memory
|
||||
| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">linear</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 33: Machine Performance - Disk Utilization ((DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
|
||||
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
|
||||
| stats first(Machine) by Machine
|
||||
| rename first(Machine) as Machine
|
||||
| table Machine]
|
||||
| eval Utilization = (DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes
|
||||
| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 34: Machine Performance - Network (Mbps Received and Mbps Sent)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
|
||||
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
|
||||
| stats first(Machine) by Machine
|
||||
| rename first(Machine) as Machine
|
||||
| table Machine]
|
||||
| timechart span=$ChartBinSizeToken$ avg(MbpsReceived) avg(MbpsSent) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.axisY.scale">log</option>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 35: Machine Performance - Disk (Reads Count and Writes Count)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
|
||||
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
|
||||
| stats first(Machine) by Machine
|
||||
| rename first(Machine) as Machine
|
||||
| table Machine]
|
||||
| timechart span=$ChartBinSizeToken$ avg(DiskReadsCount) avg(DiskWritesCount) $ChartByMachineToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 36: Network Performance - Timeout</title>
|
||||
<input type="dropdown" token="TimeoutByConnectionToken" searchWhenChanged="true">
|
||||
<label>By Connection</label>
|
||||
<choice value="By Connection">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type=ConnectionTimedOut OR Type=ConnectionTimeout)
|
||||
| replace *:tls with * in PeerAddr
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($SourcePerfConnectionToken$))
|
||||
| dedup ID]
|
||||
| join PeerAddr
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($DestinationPerfConnectionToken$))
|
||||
| dedup ID
|
||||
| rename Machine as PeerAddr]
|
||||
| eval Connection=Machine."-".PeerAddr
|
||||
| timechart useother=0 span=$ChartBinSizeToken$ count $TimeoutByConnectionToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Chart 37: Network Performance - PingLatency</title>
|
||||
<input type="dropdown" token="PingLatencyByConnectionToken" searchWhenChanged="true">
|
||||
<label>By Connection</label>
|
||||
<choice value="By Connection">Yes</choice>
|
||||
<choice value="">No</choice>
|
||||
<default></default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type=PingLatency)
|
||||
| replace *:tls with * in PeerAddr
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($SourcePerfConnectionToken$))
|
||||
| dedup ID]
|
||||
| join PeerAddr
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($DestinationPerfConnectionToken$))
|
||||
| dedup ID
|
||||
| rename Machine as PeerAddr]
|
||||
| eval Connection=Machine."-".PeerAddr
|
||||
| timechart useother=0 span=$ChartBinSizeToken$ avg(MeanLatency) avg(MaxLatency) $PingLatencyByConnectionToken$</query>
|
||||
<earliest>$TimeSpan.earliest$</earliest>
|
||||
<latest>$TimeSpan.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
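<!--
  Charts 36 and 37 first normalize TLS-suffixed addresses (replace *:tls with * in PeerAddr)
  so the peer address joins cleanly against Role machine addresses, then apply the source and
  destination role filters with two joins. The destination side looks like this (a sketch;
  the token expands to an As="..." disjunction):

    | join PeerAddr
        [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role" AND ($DestinationPerfConnectionToken$))
        | dedup ID
        | rename Machine as PeerAddr]
-->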
</row>
|
||||
</form>
|
|
@ -0,0 +1,873 @@
|
|||
<form theme="dark">
|
||||
<label>FoundationDB - Long Recovery (Dev)</label>
|
||||
<fieldset submitButton="false" autoRun="false"></fieldset>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 1: Find long recoveries (input Index and LogGroup, and select a time span).</title>
|
||||
<input type="text" token="IndexForOverview" searchWhenChanged="true">
|
||||
<label>Index</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="text" token="LogGroupForOverview" searchWhenChanged="true">
|
||||
<label>LogGroup</label>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="time" token="time_token_for_recoveryhistorytable" searchWhenChanged="true">
|
||||
<label>Select a time span</label>
|
||||
<default>
|
||||
<earliest>-0s</earliest>
|
||||
<latest>now</latest>
|
||||
</default>
|
||||
</input>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$IndexForOverview$ LogGroup=$LogGroupForOverview$
|
||||
((Type="MasterRecoveryState" AND (Status="reading_coordinated_state" OR Status="fully_recovered" OR Status="accepting_commits")) OR (Type="Role" AND As="MasterServer" AND ("Transition"="Begin" OR "Transition"="End")) OR Type="MasterTerminated") AND (NOT TrackLatestType="Rolled") | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table ID Machine Type Transition As Status DateTime Time ErrorDescription LogGroup
|
||||
| search NOT ErrorDescription="Success"
|
||||
| eval EventType=case(Transition="Begin" AND As="MasterServer" AND Type="Role", "MasterStart", Type="MasterRecoveryState" AND Status="fully_recovered", "FullRecovery", Type="MasterRecoveryState" AND Status="reading_coordinated_state", "StartRecoveryAttempt", Transition="End" AND As="MasterServer" AND Type="Role", "MasterTerminated", Type="MasterTerminated", "MasterTerminated", Type="MasterRecoveryState" AND Status="accepting_commits", "AcceptingCommits")
|
||||
| table ID Machine EventType DateTime Time ErrorDescription LogGroup
|
||||
| fillnull value="-"
|
||||
| sort -Time
|
||||
| eval ifMasterTerminatedEvent=if(EventType="MasterTerminated", 1, 0)
|
||||
| stats list(*) by ID Machine ifMasterTerminatedEvent
|
||||
| rename list(*) as *
|
||||
| table ID Machine EventType DateTime Time ErrorDescription LogGroup
|
||||
| sort -Time
|
||||
| eval LastTime=mvindex(Time, 0), FirstTime=mvindex(Time, -1), Duration=LastTime-FirstTime
|
||||
| table ID Machine Duration EventType DateTime Time ErrorDescription LogGroup</query>
|
||||
<earliest>$time_token_for_recoveryhistorytable.earliest$</earliest>
|
||||
<latest>$time_token_for_recoveryhistorytable.latest$</latest>
|
||||
</search>
|
||||
<option name="count">15</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
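<!--
  Table 1 reconstructs recovery episodes by mapping raw events onto a normalized EventType
  (MasterStart, StartRecoveryAttempt, AcceptingCommits, FullRecovery, MasterTerminated),
  grouping them per master ID, and measuring the episode length from the multivalue Time
  field. The duration step (a sketch; Time is sorted newest first by the preceding sort):

    | stats list(*) by ID Machine ifMasterTerminatedEvent
    | rename list(*) as *
    | eval LastTime=mvindex(Time, 0), FirstTime=mvindex(Time, -1), Duration=LastTime-FirstTime

  Rows with a large Duration that never reach FullRecovery are the candidates to investigate.
-->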
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 2: Select a time span containing the long recovery to see all recovery attempts within it (the Index, LogGroup, and time span inputs here apply to all following tables and charts)</title>
|
||||
<input type="text" token="Index" searchWhenChanged="true">
|
||||
<label>Index</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="text" searchWhenChanged="true" token="LogGroup">
|
||||
<label>LogGroup</label>
|
||||
</input>
|
||||
<input type="time" token="ReoveryTime" searchWhenChanged="true">
|
||||
<label>RecoveryTimeSpan</label>
|
||||
<default>
|
||||
<earliest>-0s@s</earliest>
|
||||
<latest>now</latest>
|
||||
</default>
|
||||
</input>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="MasterRecoveryState" OR (Type="MasterTerminated") OR (Type="Role" AND As="MasterServer" AND "Transition"="End") OR Type="RecoveryInternal" OR Type="ProxyReplies" OR Type="CommitProxyReplies" OR Type="ResolverReplies" OR Type="MasterRecruitedInitialStorageServers") AND (NOT TrackLatestType="Rolled")
|
||||
| rename ID as MasterID
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table MasterID Machine Status Step Type DateTime Time StatusCode MyRecoveryCount ErrorDescription Reason ErrorCode
|
||||
| fillnull value="-" ErrorDescription Reason ErrorCode
|
||||
| eval Status=case(Type=="MasterRecoveryState", Status, Type=="Role", "RoleEnd", Type=="MasterTerminated", "MasterTerminated", Type=="RecoveryInternal", Status."/".Step, Type=="ProxyReplies" OR Type=="CommitProxyReplies", "initializing_transaction_servers/ProxyReplies", Type="ResolverReplies", "initializing_transaction_servers/ResolverReplies", Type=="MasterRecruitedInitialStorageServers", "initializing_transaction_servers/MasterRecruitedInitialStorageServers"), StatusCode=case(Type=="ProxyReplies" OR Type=="CommitProxyReplies" OR Type=="ResolverReplies" OR Type=="MasterRecruitedInitialStorageServers", "8", Type!="ProxyReplies" AND Type!="CommitProxyReplies" AND Type!="ResolverReplies" AND Type!="MasterRecruitedInitialStorageServers", StatusCode)
|
||||
| fillnull value="-" StatusCode
|
||||
| sort 0 -Time -StatusCode
|
||||
| stats list(*) by MasterID Machine
|
||||
| rename list(*) as *
|
||||
| eval FirstTime=mvindex(Time, -1), LastTime=mvindex(Time, 0), Duration=LastTime-FirstTime
|
||||
| table MasterID Machine MyRecoveryCount Duration ErrorDescription Reason ErrorCode StatusCode Status DateTime Time
|
||||
| sort -MyRecoveryCount
|
||||
| fillnull value="-" MyRecoveryCount</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="count">3</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
<option name="wrap">false</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 3: Why was recovery triggered? Uses the WaitFailureClient event: Machine A detects Machine B's failure. The first column is the time when WaitFailureClient fired; columns 2-5 describe A, and columns 6-7 describe B.</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="WaitFailureClient"
|
||||
| table Type Time Machine FailedEndpoint
|
||||
| replace *:tls with * in FailedEndpoint
|
||||
| join Machine type=left
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND Transition="End"
|
||||
| eval EndTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| rename As as Role
|
||||
| table ID EndTime Machine Role]
|
||||
| join FailedEndpoint type=left
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role"
|
||||
| stats latest(*) by ID | rename latest(*) as *
|
||||
| rename Machine as FailedEndpoint
|
||||
| eval FailedEndpointLatestRoleEventInfo=As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| stats list(*) by FailedEndpoint
|
||||
| rename list(*) as *
|
||||
| table FailedEndpoint FailedEndpointLatestRoleEventInfo]
|
||||
| eval FailureDetectedTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| makemv delim=" " FailedEndpointLatestRoleEventInfo
|
||||
| table FailureDetectedTime Machine ID Role EndTime FailedEndpoint FailedEndpointLatestRoleEventInfo</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="wrap">false</option>
|
||||
</table>
|
||||
</panel>
|
||||
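<!--
  Table 3 pins down the trigger: each WaitFailureClient event names a FailedEndpoint, and two
  left joins decorate it, one on Machine for the detecting process (A) and one on
  FailedEndpoint for the roles last seen at the failed address (B). The B side (a sketch,
  condensed from the query above):

    | join FailedEndpoint type=left
        [ search index=$Index$ LogGroup=$LogGroup$ Type="Role"
        | stats latest(*) by ID | rename latest(*) as *
        | rename Machine as FailedEndpoint
        | table FailedEndpoint FailedEndpointLatestRoleEventInfo]

  Left joins keep the WaitFailureClient row even when no role information is known for a side.
-->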
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 4: New Recruitment Configuration (using MasterRecoveredConfig event)</title>
|
||||
<event>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="MasterRecoveredConfig" AND TrackLatestType="Original"
|
||||
| eval Configuration=replace(Conf, "&quot;", "\"")
|
||||
| rename Configuration as _raw</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="list.drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</event>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 5: Data Centers (using ProcessMetrics event)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type=ProcessMetrics
|
||||
| dedup DCID
|
||||
| rename DCID as DataCenterID
|
||||
| table DataCenterID pie_work_unit
|
||||
| fillnull value="-"</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Table 6: New Roles (using the Role event joined with the ProcessMetrics event)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ((As="ClusterController") OR (As="MasterServer") OR (As="TLog") OR (As="Resolver") OR (As="MasterProxyServer") OR (As="CommitProxyServer") OR (As="GrvProxyServer") OR (As="LogRouter")) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh"))
|
||||
| eventstats count by ID
|
||||
| rename As as Role
|
||||
| search count=1 AND Transition="Begin"
|
||||
| table ID Role Machine
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics
|
||||
| dedup Machine, DCID
|
||||
| rename DCID as DataCenter
|
||||
| table Machine DataCenter]
|
||||
| table ID Role Machine DataCenter
|
||||
| fillnull value="null" DataCenter
|
||||
| stats count by Role DataCenter</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 7: Role Details</title>
|
||||
<input type="multiselect" token="RolesToken" searchWhenChanged="true">
|
||||
<label>Roles</label>
|
||||
<choice value="MasterServer">MasterServer</choice>
|
||||
<choice value="TLog">TLog</choice>
|
||||
<choice value="Resolver">Resolver</choice>
|
||||
<choice value="MasterProxyServer">MasterProxyServer (for <7.0)</choice>
|
||||
<choice value="LogRouter">LogRouter</choice>
|
||||
<choice value="CommitProxyServer">CommitProxyServer (for 7.0+)</choice>
|
||||
<choice value="GrvProxyServer">GrvProxyServer (for 7.0+)</choice>
|
||||
<valuePrefix>As="</valuePrefix>
|
||||
<valueSuffix>"</valueSuffix>
|
||||
<delimiter> OR </delimiter>
|
||||
</input>
|
||||
<input type="dropdown" token="RoleDetailTableWhichRoleToken" searchWhenChanged="true">
|
||||
<label>Begin/End</label>
|
||||
<choice value="count=1 AND Transition="Begin"">Begin</choice>
|
||||
<choice value="count=1 AND Transition="End"">End</choice>
|
||||
<choice value="count=2">Begin->End</choice>
|
||||
<default>count=1 AND Transition="Begin"</default>
|
||||
</input>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($RolesToken$) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh"))
|
||||
| eventstats count by ID
|
||||
| rename As as Role
|
||||
| search $RoleDetailTableWhichRoleToken$
|
||||
| table ID Role Machine Time
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics
|
||||
| dedup Machine, DCID
|
||||
| rename DCID as DataCenter
|
||||
| table Machine DataCenter]
|
||||
| table ID Role Machine DataCenter Time
|
||||
| fillnull value="null" DataCenter
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table ID Role Machine DataCenter DateTime
|
||||
| sort 0 -DateTime</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 8: CC Recruitment SevWarn OR SevError (uses events from clusterRecruitFromConfiguration and clusterRecruitRemoteFromConfiguration)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="RecruitFromConfigurationNotAvailable" OR Type="RecruitFromConfigurationRetry" OR Type="RecruitFromConfigurationError" OR Type="RecruitRemoteFromConfigurationNotAvailable" OR Type="RecruitRemoteFromConfigurationRetry" OR Type="RecruitRemoteFromConfigurationError"
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)"), GoodRecruitmentTimeReady=case(Type=="RecruitFromConfigurationNotAvailable" OR Type=="RecruitRemoteFromConfigurationNotAvailable", "True", Type=="RecruitFromConfigurationRetry" OR Type=="RecruitRemoteFromConfigurationRetry", GoodRecruitmentTimeReady, Type=="RecruitFromConfigurationError" OR Type=="RecruitRemoteFromConfigurationError", "-")
|
||||
| table Type GoodRecruitmentTimeReady Time DateTime</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 9: RecoveryCount of the selected TLog (in Table 11)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(ID=$row.TLogID$ AND Type="TLogStart") OR (LogId=$row.TLogID$ AND Type="TLogPersistentStateRestore")
|
||||
| eval ID=if(Type="TLogStart", ID, LogId), DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table ID RecoveryCount Type DateTime | fillnull value="Not found. The FDB version is likely too old."</query>
|
||||
<earliest>-7d@h</earliest>
|
||||
<latest>now</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Table 10: Which roles the selected TLog (in Table 11) talks to</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled")
|
||||
| sort -Time
|
||||
| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null")
|
||||
| stats list(*) by TLogID
|
||||
| rename list(*) As *
|
||||
| table TLogID TLogEvents
|
||||
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0)
|
||||
| search ignore=0
|
||||
| sort TLogID
|
||||
| table TLogID TLogEvents
|
||||
| mvexpand TLogEvents
|
||||
| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), MasterID=mvindex(temp,2)
|
||||
| fields - temp - TLogEvents
|
||||
| sort 0 -Time
|
||||
| search NOT MasterID="NULL"
|
||||
| dedup MasterID
|
||||
| rename MasterID as ID
|
||||
| join type=left ID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role")
|
||||
| sort 0 -Time
|
||||
| dedup ID
|
||||
| table ID Machine As]
|
||||
| table ID Machine As | fillnull value="null" Machine As</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 11: TLog Events (Collecting all TLogs that produce interesting events during the time span)</title>
|
||||
<input type="text" token="SeeLogEventDetailTableToken" searchWhenChanged="true">
|
||||
<label>Enter * to search</label>
|
||||
</input>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR
|
||||
((Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2")) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled") AND $SeeLogEventDetailTableToken$
|
||||
| sort -Time
|
||||
| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."null", (Type="TLogReady"), Time." ".Type." "."null", (Type="TLogStart"), Time." ".Type." "."null", (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."null")
|
||||
| stats list(TLogEvents) by TLogID
|
||||
| rename list(TLogEvents) As TLogEvents
|
||||
| eval EarliestEvent=mvindex(TLogEvents, -1) , LatestEvent=mvindex(TLogEvents, 0)
|
||||
| table TLogID TLogEvents EarliestEvent LatestEvent
|
||||
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0)
|
||||
| search ignore=0
|
||||
| sort TLogID
|
||||
| join type=left TLogID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND As="TLog")
|
||||
| sort 0 -Time
|
||||
| dedup ID
|
||||
| rename ID as TLogID
|
||||
| table TLogID host LogGroup Machine]
|
||||
| table TLogID Machine LogGroup host EarliestEvent LatestEvent
|
||||
| fillnull value="null" Machine host LogGroup
|
||||
| eval temp=split(LatestEvent," "), LatestTime=mvindex(temp,0), LatestEvent=mvindex(temp,1), temp2=split(EarliestEvent," "), EarliestTime=mvindex(temp2,0), EarliestEvent=mvindex(temp2,1), Duration=LatestTime-EarliestTime
|
||||
| table TLogID Machine EarliestTime Duration LogGroup host
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type=ProcessMetrics
|
||||
| dedup Machine, DCID
|
||||
| rename DCID as DataCenter
|
||||
| table Machine DataCenter]
|
||||
| fillnull value="null" DataCenter
|
||||
| table TLogID Machine DataCenter EarliestTime Duration host LogGroup
|
||||
| join type=left TLogID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
((Type="TLogRejoining") OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow")) OR ((Type="TLogLockStarted" OR Type="TLogLocked")) OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh")) AND (NOT TrackLatestType="Rolled")
|
||||
| sort -Time
|
||||
| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null")
|
||||
| stats list(*) by TLogID
|
||||
| rename list(*) As *
|
||||
| table TLogID TLogEvents
|
||||
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0)
|
||||
| search ignore=0
|
||||
| sort TLogID
|
||||
| table TLogID TLogEvents
|
||||
| mvexpand TLogEvents
|
||||
| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), RoleID=mvindex(temp,2)
|
||||
| fields - temp - TLogEvents
|
||||
| sort 0 -Time
|
||||
| search NOT RoleID="NULL"
|
||||
| table TLogID RoleID MasterMachine
|
||||
| stats list(*) by TLogID
|
||||
| rename list(*) as *
|
||||
| streamstats count
|
||||
| mvexpand RoleID
|
||||
| dedup count RoleID
|
||||
| fields - count
|
||||
| stats count by TLogID
|
||||
| rename count as Roles
|
||||
| table TLogID Roles]
|
||||
| table TLogID Machine DataCenter Roles EarliestTime Duration host LogGroup
|
||||
| join type=left TLogID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR
|
||||
((Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled"))
|
||||
| sort -Time
|
||||
| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=if(Type="Role", Type.Transition, Type)
|
||||
| sort 0 TLogEvents
|
||||
| stats list(TLogEvents) by TLogID
|
||||
| rename list(TLogEvents) As TLogEvents
|
||||
| table TLogID TLogEvents
|
||||
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0)
|
||||
| search ignore=0
|
||||
| mvcombine delim=" " TLogEvents
|
||||
| table TLogID TLogEvents]
|
||||
| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestTime host LogGroup
|
||||
| eval EarliestDateTime=strftime(EarliestTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup
|
||||
| join type=left TLogID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="TLogStart") OR (Type="TLogPersistentStateRestore")
|
||||
| eval TLogID=if(Type="TLogStart", ID, LogId)
|
||||
| table TLogID RecoveryCount]
|
||||
| table TLogID RecoveryCount Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup
|
||||
| fillnull value="TLog too old, click and see details" RecoveryCount</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">cell</option>
|
||||
<option name="wrap">false</option>
|
||||
<drilldown>
|
||||
<set token="row.TLogID">$click.value$</set>
|
||||
</drilldown>
|
||||
</table>
|
||||
</panel>
|
||||
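<!--
  Table 11 (and Tables 10 and 12) uses a pack/unpack idiom: each event is encoded as one
  space-delimited string (Time Type Extra), collected per TLog with stats list(...), and
  split back apart after mvexpand. A sketch of the idiom:

    | eval TLogEvents=Time." ".Type." "."null"
    | stats list(TLogEvents) by TLogID
    | mvexpand TLogEvents
    | eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1)

  Packing keeps Time and Type aligned per event, since separate list() aggregations only
  line up by position.
-->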
<panel>
|
||||
<title>Table 12: Event Details (Including rejoining events) of the selected TLog (in Table 11)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="TLogRecover" AND LogId=$row.TLogID$) OR (Type="TLogReady" AND ID=$row.TLogID$) OR (Type="TLogStart" AND ID=$row.TLogID$) OR
|
||||
((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled")
|
||||
| sort -Time
|
||||
| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."-"." "."-", (Type="TLogReady"), Time." ".Type." "."-"." "."-", (Type="TLogStart"), Time." ".Type." "."-"." "."-", (Type="TLogRejoining"), Time." ".Type." ".Master." "."-", (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."-", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."-"." "."-", (Type="Role" AND As="TLog" AND Transition="Begin" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." ".Origination, (Type="Role" AND As="TLog" AND Transition="End" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." "."-")
|
||||
| stats list(*) by TLogID
|
||||
| rename list(*) As *
|
||||
| table TLogID TLogEvents
|
||||
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0)
|
||||
| search ignore=0
|
||||
| sort TLogID
|
||||
| join type=left TLogID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ (Type="Role" AND As="TLog" AND ID=$row.TLogID$)
|
||||
| dedup ID
|
||||
| rename ID as TLogID
|
||||
| table TLogID Machine]
|
||||
| table TLogID Machine TLogEvents
|
||||
| fillnull value="-" Machine
|
||||
| mvexpand TLogEvents
|
||||
| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), ToID=mvindex(temp,2), Origination= mvindex(temp,3)
|
||||
| fields - temp - TLogEvents
|
||||
| join type=left
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ (Type="Role")
|
||||
| dedup ID
|
||||
| rename ID as ToID
|
||||
| rename As as ToRole
|
||||
| rename Machine as ToMachine
|
||||
| table ToID ToRole ToMachine]
|
||||
| sort 0 -Time
|
||||
| fillnull value="-" ToRole ToMachine
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table TLogID Machine Event DateTime ToID ToRole ToMachine Time DateTime</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="count">14</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
<option name="wrap">false</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 13: All Tags of the selected TLog (in Table 11) that have been popped by SSes (using TLogPoppedTag event)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(ID=$row.TLogID$ AND Type="TLogPoppedTag")
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| rename ID as TLogID
|
||||
| rename Tags as UnpoppedRecoveredTagCount
|
||||
| rename Tag as TagPopped
|
||||
| rename DurableKCVer as DurableKnownCommittedVersion
|
||||
| search TagPopped!="-1:2"
|
||||
| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt
|
||||
| sort 0 -UnpoppedRecoveredTagCount
|
||||
| join TagPopped type=left
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="StorageMetrics")
|
||||
| stats latest(*) by Machine
|
||||
| rename latest(*) as *
|
||||
| rename Tag as TagPopped
|
||||
| table TagPopped ID Machine]
|
||||
| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt ID Machine
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type=ProcessMetrics
|
||||
| dedup Machine, DCID
|
||||
| rename DCID as DataCenter
|
||||
| table Machine DataCenter]
|
||||
| rename ID as SSID
|
||||
| rename Machine as SSMachine
|
||||
| rename DataCenter as SSDataCenter
|
||||
| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped SSID SSMachine SSDataCenter DurableKnownCommittedVersion RecoveredAt
|
||||
| fillnull value="-"</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
<option name="wrap">false</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Table 14: All Tags of the selected TLog (in Table 11) to be popped by SSes (using TLogReady event)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(ID=$row.TLogID$ AND Type="TLogReady")
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| rename ID as TLogID
|
||||
| table TLogID Type AllTags Locality
|
||||
| makemv delim="," AllTags
|
||||
| mvexpand AllTags
|
||||
| rename AllTags as Tag | sort 0 Tag
|
||||
| join Tag type=left
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="StorageMetrics")
|
||||
| stats latest(*) by Machine
|
||||
| rename latest(*) as *
|
||||
| table Tag ID Machine]
|
||||
| table TLogID Tag ID Machine
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type=ProcessMetrics
|
||||
| dedup Machine, DCID
|
||||
| rename DCID as DataCenter
|
||||
| table Machine DataCenter]
|
||||
| fillnull value="-"
|
||||
| table TLogID Tag ID Machine DataCenter
|
||||
| rename ID as SSID | rename Machine as SSMachine | rename DataCenter as SSDataCenter
|
||||
| search Tag!="-1:2"</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 15: Tags of the selected TLog (in Table 11) that are not popped by SSes (the set difference of the tags in Table 13 and Table 14; if the result contains "...", the result of Table 15 is wrong)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>| set diff
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(ID=$row.TLogID$ AND Type="TLogReady")
|
||||
| table AllTags
|
||||
| makemv delim="," AllTags
|
||||
| mvexpand AllTags
|
||||
| rename AllTags as Tag
|
||||
| table Tag]
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(ID=$row.TLogID$ AND Type="TLogPoppedTag")
|
||||
| table Tag]</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
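<!--
  Table 15 relies on the set command's diff mode, which returns results appearing in exactly
  one of the two sub-searches: here, the tags a TLog holds (AllTags from TLogReady) versus
  the tags already popped (TLogPoppedTag). The shape is:

    | set diff
        [ search ... Type="TLogReady" ... | table Tag]
        [ search ... Type="TLogPoppedTag" ... | table Tag]

  Any tag left over has not been popped by a storage server yet. The title's caveat about
  "..." likely reflects subsearch result truncation, which would make the difference wrong.
-->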
<panel>
|
||||
<title>Table 16: All Current Storage Servers (assumes each machine has at most one SS)</title>
|
||||
<input type="text" token="TriggerSSTableToken" searchWhenChanged="true">
|
||||
<label>Enter * to search</label>
|
||||
</input>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="StorageMetrics") AND $TriggerSSTableToken$
|
||||
| stats latest(*) by Machine
|
||||
| rename latest(*) as *
|
||||
| table Tag ID Machine
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type=ProcessMetrics
|
||||
| dedup Machine, DCID
|
||||
| rename DCID as DataCenter
|
||||
| table Machine DataCenter]
|
||||
| table ID Machine DataCenter Tag
|
||||
| join ID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ((As="StorageServer")) AND (NOT TrackLatestType="Rolled"))
|
||||
| stats latest(*) by Machine
|
||||
| rename latest(*) as *
|
||||
| rename As as Role
|
||||
| table ID Role Machine
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type=ProcessMetrics
|
||||
| dedup Machine, DCID
|
||||
| rename DCID as DataCenter
|
||||
| table Machine DataCenter]
|
||||
| table ID Role Machine DataCenter
|
||||
| fillnull value="null" DataCenter]
|
||||
| sort 0 DataCenter
|
||||
| table Tag ID Machine DataCenter | sort 0 Tag</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 1: Timeout/TimedOut event distribution grouped by source (Machine)</title>
|
||||
<input type="text" token="TimeoutEventByMachineTableTimeSpanToken" searchWhenChanged="true">
|
||||
<label>TimeSpan</label>
|
||||
<default>5s</default>
|
||||
</input>
|
||||
<input type="multiselect" token="TimeoutbyMachineTableSourceRoleToken" searchWhenChanged="true">
|
||||
<label>Select Source Roles</label>
|
||||
<choice value="TLog">TLog</choice>
|
||||
<choice value="MasterServer">MasterServer</choice>
|
||||
<choice value="MasterProxyServer">MasterProxyServer (for version < 7)</choice>
|
||||
<choice value="Resolver">Resolver</choice>
|
||||
<choice value="ClusterController">ClusterController</choice>
|
||||
<choice value="SharedTLog">SharedTLog</choice>
|
||||
<choice value="LogRouter">LogRouter</choice>
|
||||
<choice value="Coordinator">Coordinator</choice>
|
||||
<choice value="StorageServer">StorageServer</choice>
|
||||
<choice value="CommitProxyServer">CommitProxyServer (for version 7+)</choice>
|
||||
<choice value="GrvProxyServer">GrvProxyServer (for ver 7+)</choice>
|
||||
<valuePrefix>As="</valuePrefix>
|
||||
<valueSuffix>"</valueSuffix>
|
||||
<delimiter> OR </delimiter>
|
||||
</input>
|
||||
<input type="multiselect" token="TimeoutbyMachineTableDestinationRoleToken" searchWhenChanged="true">
|
||||
<label>Select Destination Roles</label>
|
||||
<choice value="TLog">TLog</choice>
|
||||
<choice value="MasterServer">MasterServer</choice>
|
||||
<choice value="MasterProxyServer">MasterProxyServer (for version <7)</choice>
|
||||
<choice value="Resolver">Resolver</choice>
|
||||
<choice value="ClusterController">ClusterController</choice>
|
||||
<choice value="SharedTLog">SharedTLog</choice>
|
||||
<choice value="LogRouter">LogRouter</choice>
|
||||
<choice value="Coordinator">Coordinator</choice>
|
||||
<choice value="StorageServer">StorageServer</choice>
|
||||
<choice value="CommitProxyServer">CommitProxyServer (for version 7+)</choice>
|
||||
<choice value="GrvProxyServer">GrvProxyServer (for version 7+)</choice>
|
||||
<valuePrefix>As="</valuePrefix>
|
||||
<valueSuffix>"</valueSuffix>
|
||||
<delimiter> OR </delimiter>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type=ConnectionTimedOut OR Type=ConnectionTimeout)
|
||||
| replace *:tls with * in PeerAddr
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$))
|
||||
| dedup ID]
|
||||
| join PeerAddr
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$))
|
||||
| dedup ID
|
||||
| rename Machine as PeerAddr]
|
||||
| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by Machine</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="height">233</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 2: Timeout/TimedOut event distribution grouped by destination (PeerAddr)</title>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type=ConnectionTimedOut OR Type=ConnectionTimeout)
|
||||
| replace *:tls with * in PeerAddr
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$))
|
||||
| dedup ID]
|
||||
| join PeerAddr
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$))
|
||||
| dedup ID
|
||||
| rename Machine as PeerAddr]
|
||||
| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by PeerAddr</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="height">219</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 17: Check Type=ConnectionTimedOut OR Type=ConnectionTimeout events between transaction roles in the recovery (including roles that refresh/begin/end in the timespan)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type=ConnectionTimedOut OR Type=ConnectionTimeout)
|
||||
| replace *:tls with * in PeerAddr
|
||||
| stats count as TotalTimeouts by Machine PeerAddr
|
||||
| table Machine PeerAddr TotalTimeouts
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$))
|
||||
| stats latest(*) by ID
|
||||
| rename latest(*) as *
|
||||
| eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| stats list(Role) AS MachineRoleLatestEvent BY Machine
|
||||
]
|
||||
| join PeerAddr
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$))
|
||||
| stats latest(*) by ID
|
||||
| rename latest(*) as *
|
||||
| eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| stats list(Role) AS PeerRoleLatestEvent BY Machine
|
||||
| rename Machine AS PeerAddr
|
||||
]
|
||||
| table Machine PeerAddr TotalTimeouts MachineRoleLatestEvent PeerRoleLatestEvent</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 18: Proxy 0</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="ProxyReplies" OR Type="CommitProxyReplies") AND FirstProxy="True"
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table WorkerID LogGroup FirstProxy Time DateTime
|
||||
| sort 0 -Time
|
||||
| join type=left WorkerID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="Role" AND As="Worker" AND Transition="Refresh"
|
||||
| dedup ID
|
||||
| rename ID as WorkerID
|
||||
| stats list(*) by WorkerID
|
||||
| rename list(*) as *
|
||||
| table WorkerID Machine Roles]
|
||||
| table WorkerID Machine Roles LogGroup FirstProxy Time DateTime
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="Role" AND (As="MasterProxyServer" OR As="CommitProxyServer") AND Transition="Refresh"
|
||||
| dedup ID
|
||||
| rename ID as ProxyID
|
||||
| table Machine ProxyID]
|
||||
| table ProxyID Machine LogGroup FirstProxy</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 19: Latest Role Events on the input Machine (input format IP:PORT, e.g. 172.27.113.121:4500)</title>
|
||||
<input type="text" token="SearchMachineToken" searchWhenChanged="true">
|
||||
<label>Machine (IP:PORT)</label>
|
||||
</input>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="Role" AND Machine=$SearchMachineToken$
|
||||
| stats latest(*) by ID Transition
|
||||
| rename latest(*) as *
|
||||
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table DateTime Machine ID Transition As Roles LogGroup Error ErrorDescription Reason
|
||||
| sort 0 -DateTime
|
||||
| fillnull value="-"</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Chart 3: severity>=20 event distribution (including roles that refresh/begin/end in the timespan)</title>
|
||||
<input type="text" token="BadEvents" searchWhenChanged="true">
|
||||
<label>Events</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="multiselect" token="BadEventRoleToken" searchWhenChanged="true">
|
||||
<label>Roles</label>
|
||||
<choice value="TLog">TLog</choice>
|
||||
<choice value="MasterServer">MasterServer</choice>
|
||||
<choice value="MasterProxyServer">MasterProxyServer (for version <7)</choice>
|
||||
<choice value="Resolver">Resolver</choice>
|
||||
<choice value="ClusterController">ClusterController</choice>
|
||||
<choice value="SharedTLog">SharedTLog</choice>
|
||||
<choice value="LogRouter">LogRouter</choice>
|
||||
<choice value="Coordinator">Coordinator</choice>
|
||||
<choice value="StorageServer">StorageServer</choice>
|
||||
<choice value="CommitProxyServer">CommitProxyServer (for version 7+)</choice>
|
||||
<choice value="GrvProxyServer">GrvProxyServer (for version 7+)</choice>
|
||||
<valuePrefix>As="</valuePrefix>
|
||||
<valueSuffix>"</valueSuffix>
|
||||
<delimiter> OR </delimiter>
|
||||
</input>
|
||||
<input type="dropdown" token="BadEventChartBy" searchWhenChanged="true">
|
||||
<label>By</label>
|
||||
<choice value="Type">EventType</choice>
|
||||
<choice value="Machine">Machine</choice>
|
||||
<choice value="Severity">Severity</choice>
|
||||
<default>Type</default>
|
||||
</input>
|
||||
<input type="text" token="BadEventChartTimeSpanToken" searchWhenChanged="true">
|
||||
<label>TimeSpan</label>
|
||||
<default>5s</default>
|
||||
</input>
|
||||
<chart>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Severity>10 AND $BadEvents$
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="Role" AND ($BadEventRoleToken$)
|
||||
| dedup ID | table Machine]
|
||||
| table Machine Type Severity _time
|
||||
| timechart useother=0 span=$BadEventChartTimeSpanToken$ count by $BadEventChartBy$</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="charting.chart">line</option>
|
||||
<option name="charting.drilldown">none</option>
|
||||
<option name="height">305</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
</chart>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Table 20: Check severity>=20 events of roles in the recovery (including roles that refresh/begin/end in the timespan)</title>
|
||||
<table>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Severity>10
|
||||
| stats count by Machine Type
|
||||
| rename count as Count
|
||||
| join Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="Role" AND ($BadEventRoleToken$)
|
||||
| dedup ID
|
||||
| eval Role=As."-".ID
|
||||
| stats list(Role) by Machine
|
||||
| rename list(Role) as Roles
|
||||
| table Machine Roles]
|
||||
| table Type Count Roles Machine
|
||||
| sort -Count</query>
|
||||
<earliest>$ReoveryTime.earliest$</earliest>
|
||||
<latest>$ReoveryTime.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
<option name="refresh.display">progressbar</option>
|
||||
<option name="wrap">false</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
</form>
|
|
@ -0,0 +1,247 @@
|
|||
<form theme="dark">
|
||||
<label>FoundationDB - Tracing GRV and Commit Long Latency of CC Transactions (6.3 and 7.0+) (DEV)</label>
|
||||
<description>Designed for ClusterController-issued transactions.</description>
|
||||
<fieldset submitButton="false" autoRun="true">
|
||||
<input type="text" token="Index" searchWhenChanged="true">
|
||||
<label>Index</label>
|
||||
<default></default>
|
||||
</input>
|
||||
<input type="text" token="LogGroup" searchWhenChanged="true">
|
||||
<label>LogGroup</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="text" token="transactionID">
|
||||
<label>Hex Transaction ID (optional)</label>
|
||||
<default>*</default>
|
||||
</input>
|
||||
<input type="time" token="time_token" searchWhenChanged="true">
|
||||
<label>Time span</label>
|
||||
<default>
|
||||
<earliest>@d</earliest>
|
||||
<latest>now</latest>
|
||||
</default>
|
||||
</input>
|
||||
</fieldset>
|
||||
<row>
|
||||
<panel>
|
||||
<title>All Transactions (currently, this table does not cover getrange operations or operations that do not commit)</title>
|
||||
<table>
|
||||
<title>for FDB 6.3 and 7.0+</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ ID=$transactionID$
|
||||
(Type="TransactionAttachID" OR Type="GetValueAttachID" OR Type="CommitAttachID")
|
||||
| eval To=case(Type=="TransactionAttachID", "0"."-".To, Type="GetValueAttachID", "1"."-".To, Type=="CommitAttachID", "2"."-".To)
|
||||
| stats list(To) by ID
|
||||
| rename list(To) as ToList
|
||||
| table ID ToList
|
||||
| eval Count = mvcount(ToList)
|
||||
| search Count=3
|
||||
| eval To0=mvindex(ToList,0), To1=mvindex(ToList,1), To2=mvindex(ToList,2), To0=split(To0,"-"), To1=split(To1,"-"), To2=split(To2,"-"), GrvID=case(mvindex(To0, 0)=="0", mvindex(To0, 1), mvindex(To1, 0)=="0", mvindex(To1, 1), mvindex(To2, 0)=="0", mvindex(To2, 1)), ReadID=case(mvindex(To0, 0)=="1", mvindex(To0, 1), mvindex(To1, 0)=="1", mvindex(To1, 1), mvindex(To2, 0)=="1", mvindex(To2, 1)), CommitID=case(mvindex(To0, 0)=="2", mvindex(To0, 1), mvindex(To1, 0)=="2", mvindex(To1, 1), mvindex(To2, 0)=="2", mvindex(To2, 1))
|
||||
| table ID GrvID ReadID CommitID
|
||||
| join GrvID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.Before")
|
||||
| rename ID as GrvID
|
||||
| rename Time as BeginTime
|
||||
| table GrvID BeginTime
|
||||
]
|
||||
| join GrvID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.After")
|
||||
| rename ID as GrvID
|
||||
| rename Time as GRVDoneTime
|
||||
| table GrvID GRVDoneTime
|
||||
]
|
||||
| join ReadID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="GetValueDebug" AND Location="NativeAPI.getValue.After")
|
||||
| rename ID as ReadID
|
||||
| rename Time as ReadDoneTime
|
||||
| table ReadID ReadDoneTime
|
||||
]
|
||||
| join CommitID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
(Type="CommitDebug" AND Location="NativeAPI.commit.After")
|
||||
| rename ID as CommitID
|
||||
| rename Time as CommitDoneTime
|
||||
| table CommitID CommitDoneTime
|
||||
]
|
||||
| rename ID as TransactionID
|
||||
| eval BeginToGRVDone = GRVDoneTime-BeginTime, GRVDoneToReadDone = ReadDoneTime-GRVDoneTime, ReadDoneToCommitDone = CommitDoneTime-ReadDoneTime, Duration=CommitDoneTime-BeginTime, BeginTimeScope=BeginTime-1, EndTimeScope=CommitDoneTime+1, BeginDateTime=strftime(BeginTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
|
||||
| table TransactionID Duration BeginDateTime BeginToGRVDone GRVDoneToReadDone ReadDoneToCommitDone GrvID ReadID CommitID BeginTimeScope EndTimeScope | sort -Duration</query>
|
||||
<earliest>$time_token.earliest$</earliest>
|
||||
<latest>$time_token.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">cell</option>
|
||||
<drilldown>
|
||||
<set token="BeginTime">$row.BeginTimeScope$</set>
|
||||
<set token="EndTime">$row.EndTimeScope$</set>
|
||||
<set token="ReadID">$row.ReadID$</set>
|
||||
<set token="GrvID">$row.GrvID$</set>
|
||||
<set token="CommitID">$row.CommitID$</set>
|
||||
</drilldown>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Step1: GRV</title>
|
||||
<table>
|
||||
<title>for FDB 6.3 and 7.0+</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="TransactionDebug" AND (NOT MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion)
|
||||
AND (ID=$GrvID$ OR ID=
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="TransactionAttachID" AND ID=$GrvID$
|
||||
| return $To])
|
||||
| table Time Type ID Location Machine Roles
|
||||
| eventstats min(Time) as MinTime
|
||||
| eval Delta = Time - MinTime, Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location=="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location=="GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster", 2.1, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location=="MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion", 4, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6)
|
||||
| table Time Delta Order Type ID Location Machine Roles
|
||||
| sort 0 Order
|
||||
| table Machine Location Delta Time Roles ID Type</query>
|
||||
<earliest>$BeginTime$</earliest>
|
||||
<latest>$EndTime$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
<panel>
|
||||
<title>Step1 (only for FDB 6.3): GRV --- Get Committed Version (MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion events)</title>
|
||||
<table>
|
||||
<title>only for FDB 6.3</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="TransactionDebug" AND Location="MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion"
|
||||
AND ID=
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="TransactionAttachID" AND ID=$GrvID$
|
||||
| return $To]
|
||||
| table Time Type ID Location Machine Roles
|
||||
| eventstats min(Time) as MinTime
|
||||
| eval Delta = Time - MinTime
|
||||
| sort 0 -Time
|
||||
| table Machine Delta Time Roles ID Type</query>
|
||||
<earliest>$BeginTime$</earliest>
|
||||
<latest>$EndTime$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Step2: GetValue</title>
|
||||
<table>
|
||||
<title>for FDB 6.3 and 7.0+</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$ Type="GetValueDebug" AND ID=$ReadID$
|
||||
| eventstats min(Time) as MinTime
|
||||
| eval Delta = Time-MinTime
|
||||
| table Machine Location Delta Time Roles ID Type
|
||||
| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10)
|
||||
| sort 0 Order
|
||||
| table Machine Location Delta Time Roles ID Type</query>
|
||||
<earliest>$time_token.earliest$</earliest>
|
||||
<latest>$time_token.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Step3: Commit</title>
|
||||
<table>
|
||||
<title>for FDB 6.3 and 7.0+</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitDebug" AND (ID=$CommitID$ OR ID=
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitAttachID" AND ID=$CommitID$
|
||||
| return $To])
|
||||
|
||||
| table Time Type ID Location Machine Roles
|
||||
| eventstats min(Time) as MinTime
|
||||
| eval Delta = Time-MinTime
|
||||
| table Machine Location Delta Time Roles ID Type
|
||||
| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLogServer.tLogCommit.BeforeWaitForVersion", 11, Location=="TLogServer.tLogCommit.Before", 12, Location=="TLogServer.tLogCommit.AfterTLogCommit", 13, Location=="TLogServer.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16)
|
||||
| sort 0 Order
|
||||
| table Machine Location Delta Time Roles ID Type</query>
|
||||
<earliest>$BeginTime$</earliest>
|
||||
<latest>$EndTime$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Step3: Commit --- Resolver</title>
|
||||
<table>
|
||||
<title>for FDB 6.3 and 7.0+</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Location="Resolver*")
|
||||
| join ID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitAttachID" AND ID=
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitAttachID" AND ID=$CommitID$
|
||||
| return $To]
|
||||
| rename To as ID
|
||||
| table ID]
|
||||
| eventstats min(Time) as MinTime
|
||||
| eval Delta = Time-MinTime
|
||||
| eval Order=case(Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8)
|
||||
| sort 0 Time Order
|
||||
| stats list(*) by Type ID Machine Roles
|
||||
| rename list(*) as *
|
||||
| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration
|
||||
| table Machine Roles Duration Location Delta Time
|
||||
| join type=left Machine
|
||||
[ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics
|
||||
| dedup Machine, DCID
|
||||
| rename DCID as DataCenter
|
||||
| table Machine DataCenter]
|
||||
| table Machine DataCenter Roles Duration Location Delta Time</query>
|
||||
<earliest>$time_token.earliest$</earliest>
|
||||
<latest>$time_token.latest$</latest>
|
||||
</search>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
<row>
|
||||
<panel>
|
||||
<title>Step3: Commit --- Commit to TLogs (CommitDebug Events), grouped by Machine and sorted by Duration</title>
|
||||
<table>
|
||||
<title>for FDB 6.3 and 7.0+</title>
|
||||
<search>
|
||||
<query>index=$Index$ LogGroup=$LogGroup$
|
||||
(Location="TLog*")
|
||||
| join ID
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitAttachID" AND ID=
|
||||
[ search index=$Index$ LogGroup=$LogGroup$
|
||||
Type="CommitAttachID" AND ID=$CommitID$
|
||||
| return $To]
|
||||
| rename To as ID
|
||||
| table ID]
|
||||
| eventstats min(Time) as MinTime
|
||||
| eval Delta = Time-MinTime
|
||||
| sort 0 Time
|
||||
| stats list(*) by Type ID Machine Roles
|
||||
| rename list(*) as *
|
||||
| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration
|
||||
| table Machine Roles Duration Location Delta Time</query>
|
||||
<earliest>$BeginTime$</earliest>
|
||||
<latest>$EndTime$</latest>
|
||||
</search>
|
||||
<option name="count">10</option>
|
||||
<option name="drilldown">none</option>
|
||||
</table>
|
||||
</panel>
|
||||
</row>
|
||||
</form>
|
|
@ -284,6 +284,12 @@ class ErrorCommitInfo(BaseInfo):
|
|||
if protocol_version >= PROTOCOL_VERSION_6_3:
|
||||
self.report_conflicting_keys = bb.get_bool()
|
||||
|
||||
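# Protocol 7.1+ adds a lock-aware flag and an optional 16-byte span ID
# (a descriptive comment inferred from the fields read below).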
if protocol_version >= PROTOCOL_VERSION_7_1:
|
||||
lock_aware = bb.get_bool()
|
||||
if bb.get_bool():
|
||||
spanId = bb.get_bytes(16)
|
||||
|
||||
|
||||
class UnsupportedProtocolVersionError(Exception):
|
||||
def __init__(self, protocol_version):
|
||||
super().__init__("Unsupported protocol version 0x%0.2X" % protocol_version)
|
||||
|
|
|
@ -373,3 +373,302 @@ with the ``multitest`` role:
|
|||
fdbserver -r multitest -f testfile.txt
|
||||
|
||||
This command will block until all tests are completed.
|
||||
|
||||
##########
|
||||
API Tester
|
||||
##########
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
The API tester is a framework for implementing end-to-end tests of the FDB C API, i.e. testing the API on a real
|
||||
FDB cluster through all layers of the FDB client. Its executable is ``fdb_c_api_tester``, and the source
|
||||
code is located in ``bindings/c/test/apitester``. The structure of API Tests is similar to that of the
|
||||
Simulation Tests. The tests are implemented as workloads using FDB API, which are all built into the
|
||||
``fdb_c_api_tester``. A concrete test configuration is defined as a TOML file, which specifies the
|
||||
combination of workloads to be executed by the test together with their parameters. The test can then be
|
||||
executed by passing the TOML file as a parameter to ``fdb_c_api_tester``.
|
||||
|
||||
Since simulation tests rely on the actor model to execute the tests deterministically in single-threaded
|
||||
mode, they are not suitable for testing various multi-threaded aspects of the FDB client. End-to-end API
|
||||
tests complement the simulation tests by testing the FDB Client layers above the single-threaded Native
|
||||
Client.
|
||||
|
||||
The specific testing goals of the end-to-end tests are:
|
||||
- Checking functional correctness of the Multi-Version Client (MVC) and the Thread-Safe Client
|
||||
- Detecting race conditions. They can be caused by accessing the state of the Native Client from the wrong
|
||||
threads or by introducing other shared state without proper synchronization.
|
||||
- Detecting memory management errors. Thread-safe reference counting must be used where necessary. MVC
|
||||
works with multiple client libraries. Memory allocated by one client library must also be deallocated
|
||||
by the same library.
|
||||
- Maintaining interoperability with other client versions. The client functionality is made available
|
||||
depending on the selected API version, and API changes are adapted accordingly.
|
||||
- Ensuring the client API behaves correctly in case of cluster upgrades. Database and transaction state is correctly
|
||||
migrated to the upgraded connections. Pending operations are canceled and successfully retried on the
|
||||
upgraded connections.
|
||||
|
||||
Implementing a Workload
|
||||
=======================
|
||||
|
||||
Each workload is declared as a direct or indirect subclass of ``WorkloadBase`` implementing a constructor
|
||||
with ``WorkloadConfig`` as a parameter and the method ``start()``, which defines the entry point of the
|
||||
workload.
|
||||
|
||||
``WorkloadBase`` provides a set of methods that serve as building blocks for implementing a workload:
|
||||
|
||||
.. function:: execTransaction(start, cont, failOnError = true)
|
||||
|
||||
creates and executes an FDB transaction. Here ``start`` is a function that takes a transaction context
|
||||
as a parameter and implements the starting point of the transaction, and ``cont`` is a function implementing
|
||||
a continuation to be executed after finishing the transaction execution. Transactions are automatically
|
||||
retried on retryable errors. Transactions are retried by calling the ``start`` function again. In case
|
||||
of a fatal error, the entire workload is considered failed unless ``failOnError`` is set to ``false``.
|
||||
|
||||
.. function:: schedule(task)
|
||||
|
||||
schedules a task for asynchronous execution. It is usually used in the continuations to schedule
|
||||
the next step of the workload.
|
||||
|
||||
.. function:: info(msg)
|
||||
error(msg)
|
||||
|
||||
are used for logging a message with a tag identifying the workload. Issuing an error message marks
|
||||
the workload as failed.
|
||||
|
||||
The transaction context provides methods for implementing the transaction logic:
|
||||
|
||||
.. function:: tx()
|
||||
|
||||
the reference to the FDB transaction object
|
||||
|
||||
.. function:: continueAfter(future, cont, retryOnError = true)
|
||||
|
||||
set a continuation to be executed when the future is ready. The ``retryOnError`` flag controls whether
|
||||
the transaction should be automatically retried in case the future results in a retriable error.
|
||||
|
||||
.. function:: continueAfterAll(futures, cont)
|
||||
|
||||
takes a vector of futures and sets a continuation to be executed when all of the futures get ready.
|
||||
The transaction is retried if at least one of the futures results in an error. This method is useful
|
||||
for handling multiple concurrent reads.
|
||||
|
||||
.. function:: commit()
|
||||
|
||||
commit and finish the transaction. If the commit is successful, the execution proceeds to the
|
||||
continuation of ``execTransaction()``. In case of a retriable error the transaction is
|
||||
automatically retried. A fatal error results in a failure of the workload.
|
||||
|
||||
|
||||
.. function:: done()
|
||||
|
||||
finish the transaction without committing. This method should be used to finish read transactions.
|
||||
The transaction gets destroyed and execution proceeds to the continuation of ``execTransaction()``.
|
||||
Each transaction must be finished either by ``commit()`` or ``done()``, because otherwise
|
||||
the framework considers that the transaction is still being executed, so it won't destroy it and
|
||||
won't call the continuation.
|
||||
|
||||
.. function:: onError(err)
|
||||
|
||||
Handle an error: restart the transaction in case of a retriable error, otherwise fail the workload.
|
||||
This method is typically used in the continuation of ``continueAfter`` called with
|
||||
``retryOnError=false`` as a fallback to the default error handling.
|
||||
|
||||
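For illustration, a read transaction assembled from these building blocks could look roughly like
this (a minimal sketch in the style of the example workload below; the helper name ``readAndCheck``
and the error text are made up, and the exact wrapper signatures should be checked against
``fdb_api.hpp``):

.. code-block:: C++

    void readAndCheck(fdb::Key key, fdb::Value expected, TTaskFct cont) {
        execTransaction(
            [this, key, expected](auto ctx) {
                // Issue the read; the wrapper returns a future.
                auto future = ctx->tx().get(key, /*snapshot*/ false);
                // Register a continuation to run when the future is ready.
                ctx->continueAfter(future, [this, ctx, future, expected]() {
                    std::optional<fdb::Value> res = copyValueRef(future.get());
                    if (res != expected) {
                        error("read returned an unexpected value");
                    }
                    // Read-only transaction: finish with done(), not commit().
                    ctx->done();
                });
            },
            // Continuation of execTransaction(): schedule the next step.
            [this, cont]() { schedule(cont); });
    }
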
A workload execution ends automatically when it is marked as failed or its last continuation does not
|
||||
schedule any new task or transaction.
|
||||
|
||||
The workload class should be defined in the namespace FdbApiTester. The file name convention is
|
||||
``Tester{Name}Workload.cpp`` so that we distinguish them from the source files of simulation workloads.
|
||||
|
||||
Basic Workload Example
|
||||
======================
|
||||
|
||||
The code below implements a workload that consists of only two transactions. The first one sets a
|
||||
randomly generated key to a randomly generated value, and the second one reads the key and checks if
|
||||
the returned value matches the written one.
|
||||
|
||||
.. literalinclude:: ../../../bindings/c/test/apitester/TesterExampleWorkload.cpp
   :language: C++
   :lines: 21-
|
||||
|
||||
The workload is implemented in the method ``setAndGet``. It generates a random key and a random value
|
||||
and executes a transaction that writes that key-value pair and commits. In the continuation of the
|
||||
first ``execTransaction`` call, we execute the second transaction that reads the same key. The read
|
||||
operation returns a future, so we call ``continueAfter`` to set a continuation for that future. In the
|
||||
continuation we check if the returned value matches the written one and finish the transaction by
|
||||
calling ``ctx->done()``. After completing the second transaction we execute the continuation passed
|
||||
as a parameter to the ``setAndGet`` method by the ``start`` method. In this case it is ``NO_OP_TASK``, which
|
||||
does nothing and so finishes the workload.
|
||||
|
||||
Finally, we declare an instance of ``WorkloadFactory`` to register this workload under the name ``SetAndGet``.
|
||||
|
||||
Note that we use ``workloadId`` as a key prefix. This is necessary for isolating the key space of this
|
||||
workload, because the framework may be instructed to create multiple instances of the ``SetAndGet``
|
||||
workload. If we do not isolate the key space, another workload can write a different value for the
|
||||
same key and so break the assumption of the test.
|
||||
|
||||
The workload is implemented using the internal C++ API defined in ``fdb_api.hpp``, which introduces
|
||||
a set of classes representing the FDB objects (transactions, futures, etc.). These classes provide C++-style
|
||||
methods wrapping FDB C API calls and automate memory management by means of reference counting.
|
||||
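
For example, reading a value through these wrappers requires no manual future destruction (a hedged
sketch; the method names and signatures here are assumptions to be checked against ``fdb_api.hpp``):

.. code-block:: C++

    // Assuming an fdb::Database handle 'db' obtained elsewhere.
    // fdb::Transaction and the returned future are reference-counted handles;
    // the underlying FDB C objects are released when the last reference goes
    // out of scope, with no explicit fdb_*_destroy() calls.
    fdb::Transaction tx = db.createTransaction();
    auto future = tx.get(fdb::toBytesRef(std::string("sample-key")), /*snapshot*/ false);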
|
||||
Implementing Control Structures
|
||||
===============================
|
||||
|
||||
Our basic workload executes just two transactions, but in practice we want workloads that generate
|
||||
multiple transactions. The following code demonstrates how we can modify our basic workload to generate
|
||||
multiple transactions in a loop.
|
||||
|
||||
.. code-block:: C++

    class SetAndGetWorkload : public WorkloadBase {
    public:
        ...
        int numIterations;
        int iterationsLeft;

        SetAndGetWorkload(const WorkloadConfig& config) : WorkloadBase(config) {
            keyPrefix = fdb::toBytesRef(fmt::format("{}/", workloadId));
            numIterations = config.getIntOption("numIterations", 1000);
        }

        void start() override {
            iterationsLeft = numIterations;
            setAndGetLoop();
        }

        void setAndGetLoop() {
            if (iterationsLeft == 0) {
                return;
            }
            iterationsLeft--;
            setAndGet([this]() { setAndGetLoop(); });
        }
        ...
    };
|
||||
|
||||
We introduce a workload parameter ``numIterations`` to specify the number of iterations. If not specified
|
||||
in the test configuration it defaults to 1000.
|
||||
|
||||
The method ``setAndGetLoop`` implements the loop that decrements the ``iterationsLeft`` counter until it reaches 0;
|
||||
each iteration calls ``setAndGet`` with a continuation that returns execution to the loop. As you
|
||||
can see, we don't need any change in ``setAndGet``; we just call it with another continuation.
|
||||
|
||||
The pattern of passing a continuation as a parameter can also be used to decompose the workload into a
|
||||
sequence of steps. For example, we can introduce setup and cleanup steps in our workload and modify the
|
||||
``setAndGetLoop`` to make it composable with an arbitrary continuation:
|
||||
|
||||
.. code-block:: C++

    void start() override {
        setup([this]() {
            iterationsLeft = numIterations;
            setAndGetLoop([this]() { cleanup(NO_OP_TASK); });
        });
    }

    void setAndGetLoop(TTaskFct cont) {
        if (iterationsLeft == 0) {
            schedule(cont);
            return;
        }
        iterationsLeft--;
        setAndGet([this, cont]() { setAndGetLoop(cont); });
    }

    void setup(TTaskFct cont) { ... }

    void cleanup(TTaskFct cont) { ... }
|
||||
|
||||
Note that we call ``schedule(cont)`` in ``setAndGetLoop`` instead of calling the continuation directly.
|
||||
In this way we avoid keeping ``setAndGetLoop`` on the call stack when executing the next step.
|
||||
|
||||
Subclassing ApiWorkload
|
||||
=======================
|
||||
|
||||
``ApiWorkload`` is an abstract subclass of ``WorkloadBase`` that provides a framework for a typical
|
||||
implementation of API test workloads. It implements a workflow consisting of cleaning up the key space
|
||||
of the workload, populating it with newly generated data and then running a loop consisting of random
|
||||
database operations. The concrete subclasses of ``ApiWorkload`` are expected to override the method
|
||||
``randomOperation`` with an implementation of concrete random operations.
|
||||
|
||||
The ``ApiWorkload`` maintains a local key-value store that mirrors the part of the database state
|
||||
relevant to the workload. A successful database write operation should be followed by a continuation
|
||||
that performs equivalent changes in the local store, and the results of a database read operation should
|
||||
be validated against the values from the local store.
|
||||
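
A minimal sketch of such a subclass (the virtual method name follows the description above; the
operation helper ``randomSetOp`` is made up for illustration, and the exact ``ApiWorkload``
interface should be checked in ``bindings/c/test/apitester``):

.. code-block:: C++

    class MySimpleWorkload : public ApiWorkload {
    public:
        MySimpleWorkload(const WorkloadConfig& config) : ApiWorkload(config) {}

        // Invoked repeatedly by the ApiWorkload operation loop; each call
        // executes one random operation and passes the continuation along.
        void randomOperation(TTaskFct cont) override {
            // A real workload would pick among several operations here,
            // mirroring writes into the local key-value store and
            // validating reads against it.
            randomSetOp(cont);
        }
    };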
|
||||
Test Configuration
|
||||
==================
|
||||
|
||||
A concrete test configuration is specified by a TOML file. The file must contain one ``[[test]]`` section
|
||||
specifying the general settings for test execution followed by one or more ``[[test.workload]]``
|
||||
configuration sections, specifying the workloads to be executed and their parameters. The specified
|
||||
workloads are started all at once and executed concurrently.
|
||||
|
||||
The ``[[test]]`` section can contain the following options:
|
||||
|
||||
- ``title``: descriptive title of the test
|
||||
- ``multiThreaded``: enable multi-threading (default: false)
|
||||
- ``minFdbThreads`` and ``maxFdbThreads``: the number of FDB (network) threads to be randomly selected
|
||||
from the given range (default: 1-1). Used only if ``multiThreaded=true``. It is also important to use
|
||||
multiple database instances to make use of the multithreading.
|
||||
- ``minDatabases`` and ``maxDatabases``: the number of database instances to be randomly selected from
|
||||
the given range (default 1-1). The transactions of all workloads are randomly load-balanced over the
|
||||
pool of database instances.
|
||||
- ``minClients`` and ``maxClients``: the number of clients, i.e. instances of each workload, to be
|
||||
randomly selected from the given range (default 1-8).
|
||||
- ``minClientThreads`` and ``maxClientThreads``: the number of client threads, i.e. the threads used
|
||||
for execution of the workload, to be randomly selected from the given range (default 1-1).
|
||||
- ``blockOnFutures``: use blocking waits on futures instead of scheduling future callbacks asynchronously
|
||||
(default: false)
|
||||
- ``buggify``: Enable client-side failure injection (default: false)
|
||||
- ``databasePerTransaction``: Create a separate database instance for each transaction (default: false).
|
||||
It is a special mode useful for testing bugs related to creation and destruction of database instances.
|
||||
- ``fdbCallbacksOnExternalThreads``: Enables the option ``FDB_NET_OPTION_CALLBACKS_ON_EXTERNAL_THREADS``
|
||||
causing the callbacks of futures to be executed directly on the threads of the external FDB clients
|
||||
rather than on the thread of the local FDB client.
|
||||
|
||||
The workload section ``[[test.workload]]`` must contain the attribute ``name``, matching the registered name
|
||||
of the workload to be executed. Other options are workload-specific.
|
||||
|
||||
The subclasses of the ``ApiWorkload`` inherit the following configuration options:
|
||||
|
||||
- ``minKeyLength`` and ``maxKeyLength``: the size range of randomly generated keys (default: 1-64)
|
||||
- ``minValueLength`` and ``maxValueLength``: the size range of randomly generated values
|
||||
(default: 1-1000)
|
||||
- ``maxKeysPerTransaction``: the maximum number of keys per transaction (default: 50)
|
||||
- ``initialSize``: the number of key-value pairs in the initially populated database (default: 1000)
|
||||
- ``readExistingKeysRatio``: the probability of choosing an existing key for read operations
|
||||
(default: 0.9)
|
||||
- ``numRandomOperations``: the number of random operations to be executed per workload (default: 1000)
|
||||
- ``runUntilStop``: run the workload indefinitely until the stop command is received (default: false).
|
||||
This execution mode is used in upgrade tests and other scripted tests, where the workload needs to
|
||||
be generated continuously until completion of the scripted test.
|
||||
- ``numOperationsForProgressCheck``: the number of operations to be performed to confirm a progress
|
||||
check (default: 10). This option is used in combination with ``runUntilStop``. Progress checks are
|
||||
initiated by a test script to check if the client workload is successfully progressing after a
|
||||
cluster change.
|
||||
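
For illustration, a complete test configuration could look roughly like this (a hedged sketch: the
option names follow the descriptions above and the workload options are those of the ``SetAndGet``
example; consult the files in ``bindings/c/test/apitester/tests/`` for authoritative examples):

.. code-block:: toml

    [[test]]
    title = 'SetAndGet correctness, multi-threaded'
    multiThreaded = true
    minFdbThreads = 2
    maxFdbThreads = 4
    minDatabases = 2
    maxDatabases = 4

        [[test.workload]]
        name = 'SetAndGet'
        numIterations = 1000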
|
||||
Executing the Tests
|
||||
===================
|
||||
|
||||
The ``fdb_c_api_tester`` executable takes a single TOML file as a parameter and executes the test
|
||||
according to its specification. Before that, we must create an FDB cluster and pass its cluster file as
|
||||
a parameter to ``fdb_c_api_tester``. Note that multithreaded tests also need to be provided with an
|
||||
external client library.
|
||||
|
||||
For example, we can create a temporary cluster and use it for execution of one of the existing API tests:
|
||||
|
||||
.. code-block:: bash

    ${srcDir}/tests/TestRunner/tmp_cluster.py --build-dir ${buildDir} -- \
        ${buildDir}/bin/fdb_c_api_tester \
        --cluster-file @CLUSTER_FILE@ \
        --external-client-library=${buildDir}/bindings/c/libfdb_c_external.so \
        --test-file ${srcDir}/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
|
||||
|
||||
The test specifications added to the ``bindings/c/test/apitester/tests/`` directory are executed as a part
|
||||
of the regression test suite. They can be executed using the ``ctest`` target ``fdb_c_api_tests``:
|
||||
|
||||
.. code-block:: bash

    ctest -R fdb_c_api_tests -VV
|
||||
|
|
|
@ -22,6 +22,8 @@ Each special key that existed before api version 630 is its own module. These ar
|
|||
#. ``\xff\xff/cluster_file_path`` - See :ref:`cluster file client access <cluster-file-client-access>`
|
||||
#. ``\xff\xff/status/json`` - See :doc:`Machine-readable status <mr-status>`
|
||||
|
||||
#. ``\xff\xff/worker_interfaces`` - the key is the worker's network address and the value is the serialized ``ClientWorkerInterface``; not transactional
|
||||
|
||||
Prior to api version 630, it was also possible to read a range starting at ``\xff\xff/worker_interfaces``. This is mostly an implementation detail of fdbcli,
|
||||
but it's available in api version 630 as a module with prefix ``\xff\xff/worker_interfaces/``.
|
||||
|
||||
|
@ -210,6 +212,7 @@ that process, and wait for necessary data to be moved away.
|
|||
#. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/<locality>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
|
||||
#. ``\xff\xff/management/tenant/map/<tenant>`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ``<tenant>``. Clearing a key in this range will delete the tenant with name ``<tenant>``. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants.
|
||||
#. ``\xff\xff/management/tenant/rename/<tenant>`` Read/write. Setting a key in this range to an unused tenant name will result in the tenant with the name ``<tenant>`` being renamed to the value provided. If the rename operation is part of a transaction retried in a loop, it is possible for the rename to be applied twice, in which case ``tenant_not_found`` or ``tenant_already_exists`` errors may be returned. This can be avoided by checking for the tenant's existence first.
|
||||
#. ``\xff\xff/management/options/worker_interfaces/verify`` Read/write. Setting this key adds a verification phase to reading ``\xff\xff/worker_interfaces``. Setting this key only has an effect in the current transaction and is not persisted on commit. With verification enabled, the client tries to establish connections with every worker from the list returned by the Cluster Controller and returns only those workers that it can connect to. This option is currently only used in the fdbcli commands ``kill``, ``suspend`` and ``expensive_data_check`` to populate the worker list.
|
||||
|
||||
An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or
|
||||
an ip address and port (e.g. ``127.0.0.1:4500``), or any locality (e.g. ``locality_dcid:primary-satellite`` or
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "fdbclient/FDBOptions.g.h"
|
||||
#include "fdbclient/IClientApi.h"
|
||||
#include "fdbclient/ManagementAPI.actor.h"
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/FastRef.h"
|
||||
|
@ -31,33 +32,6 @@
|
|||
|
||||
namespace {
|
||||
|
||||
// copy to standalones for krm
|
||||
ACTOR Future<Void> setBlobRange(Database db, Key startKey, Key endKey, Value value) {
|
||||
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
|
||||
|
||||
loop {
|
||||
try {
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
// FIXME: check that the set range is currently inactive, and that a revoked range is currently its own
|
||||
// range in the map and fully set.
|
||||
|
||||
tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString());
|
||||
// This is not coalescing because we want to keep each range logically separate.
|
||||
wait(krmSetRange(tr, blobRangeKeys.begin, KeyRange(KeyRangeRef(startKey, endKey)), value));
|
||||
wait(tr->commit());
|
||||
printf("Successfully updated blob range [%s - %s) to %s\n",
|
||||
startKey.printable().c_str(),
|
||||
endKey.printable().c_str(),
|
||||
value.printable().c_str());
|
||||
return Void();
|
||||
} catch (Error& e) {
|
||||
wait(tr->onError(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Version> getLatestReadVersion(Database db) {
|
||||
state Transaction tr(db);
|
||||
loop {
|
||||
|
@ -99,65 +73,10 @@ ACTOR Future<Void> doBlobPurge(Database db, Key startKey, Key endKey, Optional<V
|
|||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Version> checkBlobSubrange(Database db, KeyRange keyRange, Optional<Version> version) {
|
||||
state Transaction tr(db);
|
||||
state Version readVersionOut = invalidVersion;
|
||||
loop {
|
||||
try {
|
||||
wait(success(tr.readBlobGranules(keyRange, 0, version, &readVersionOut)));
|
||||
return readVersionOut;
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> doBlobCheck(Database db, Key startKey, Key endKey, Optional<Version> version) {
|
||||
state Transaction tr(db);
|
||||
state Version readVersionOut = invalidVersion;
|
||||
state double elapsed = -timer_monotonic();
|
||||
state KeyRange range = KeyRange(KeyRangeRef(startKey, endKey));
|
||||
state Standalone<VectorRef<KeyRangeRef>> allRanges;
|
||||
loop {
|
||||
try {
|
||||
wait(store(allRanges, tr.getBlobGranuleRanges(range)));
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
|
||||
if (allRanges.empty()) {
|
||||
fmt::print("ERROR: No blob ranges for [{0} - {1})\n", startKey.printable(), endKey.printable());
|
||||
return Void();
|
||||
}
|
||||
fmt::print("Loaded {0} blob ranges to check\n", allRanges.size());
|
||||
state std::vector<Future<Version>> checkParts;
|
||||
// Chunk up to smaller ranges than this limit. Must be smaller than BG_TOO_MANY_GRANULES to not hit the limit
|
||||
int maxChunkSize = CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2;
|
||||
KeyRange currentChunk;
|
||||
int currentChunkSize = 0;
|
||||
for (auto& it : allRanges) {
|
||||
if (currentChunkSize == maxChunkSize) {
|
||||
checkParts.push_back(checkBlobSubrange(db, currentChunk, version));
|
||||
currentChunkSize = 0;
|
||||
}
|
||||
if (currentChunkSize == 0) {
|
||||
currentChunk = it;
|
||||
} else if (it.begin != currentChunk.end) {
|
||||
fmt::print("ERROR: Blobrange check failed, gap in blob ranges from [{0} - {1})\n",
|
||||
currentChunk.end.printable(),
|
||||
it.begin.printable());
|
||||
return Void();
|
||||
} else {
|
||||
currentChunk = KeyRangeRef(currentChunk.begin, it.end);
|
||||
}
|
||||
currentChunkSize++;
|
||||
}
|
||||
checkParts.push_back(checkBlobSubrange(db, currentChunk, version));
|
||||
|
||||
wait(waitForAll(checkParts));
|
||||
readVersionOut = checkParts.back().get();
|
||||
state Version readVersionOut = wait(db->verifyBlobRange(KeyRangeRef(startKey, endKey), version));
|
||||
|
||||
elapsed += timer_monotonic();
|
||||
|
||||
|
@ -201,7 +120,7 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
|
|||
fmt::print("Invalid blob range [{0} - {1})\n", tokens[2].printable(), tokens[3].printable());
|
||||
} else {
|
||||
if (tokencmp(tokens[1], "start") || tokencmp(tokens[1], "stop")) {
|
||||
bool starting = tokencmp(tokens[1], "start");
|
||||
state bool starting = tokencmp(tokens[1], "start");
|
||||
if (tokens.size() > 4) {
|
||||
printUsage(tokens[0]);
|
||||
return false;
|
||||
|
@ -210,7 +129,19 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
|
|||
starting ? "Starting" : "Stopping",
|
||||
tokens[2].printable().c_str(),
|
||||
tokens[3].printable().c_str());
|
||||
wait(setBlobRange(localDb, begin, end, starting ? LiteralStringRef("1") : StringRef()));
|
||||
state bool success = false;
|
||||
if (starting) {
|
||||
wait(store(success, localDb->blobbifyRange(KeyRangeRef(begin, end))));
|
||||
} else {
|
||||
wait(store(success, localDb->unblobbifyRange(KeyRangeRef(begin, end))));
|
||||
}
|
||||
if (!success) {
|
||||
fmt::print("{0} blobbify range for [{1} - {2}) failed\n",
|
||||
starting ? "Starting" : "Stopping",
|
||||
tokens[2].printable().c_str(),
|
||||
tokens[3].printable().c_str());
|
||||
}
|
||||
return success;
|
||||
} else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check")) {
|
||||
bool purge = tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge");
|
||||
bool forcePurge = tokencmp(tokens[1], "forcepurge");
|
||||
|
|
|
@ -46,7 +46,7 @@ ACTOR Future<bool> expensiveDataCheckCommandActor(
|
|||
if (tokens.size() == 1) {
|
||||
// initialize worker interfaces
|
||||
address_interface->clear();
|
||||
wait(getWorkerInterfaces(tr, address_interface));
|
||||
wait(getWorkerInterfaces(tr, address_interface, true));
|
||||
}
|
||||
if (tokens.size() == 1 || tokencmp(tokens[1], "list")) {
|
||||
if (address_interface->size() == 0) {
|
||||
|
|
|
@ -44,7 +44,7 @@ ACTOR Future<bool> killCommandActor(Reference<IDatabase> db,
|
|||
if (tokens.size() == 1) {
|
||||
// initialize worker interfaces
|
||||
address_interface->clear();
|
||||
wait(getWorkerInterfaces(tr, address_interface));
|
||||
wait(getWorkerInterfaces(tr, address_interface, true));
|
||||
}
|
||||
if (tokens.size() == 1 || tokencmp(tokens[1], "list")) {
|
||||
if (address_interface->size() == 0) {
|
||||
|
|
|
@ -43,7 +43,7 @@ ACTOR Future<bool> suspendCommandActor(Reference<IDatabase> db,
|
|||
if (tokens.size() == 1) {
|
||||
// initialize worker interfaces
|
||||
address_interface->clear();
|
||||
wait(getWorkerInterfaces(tr, address_interface));
|
||||
wait(getWorkerInterfaces(tr, address_interface, true));
|
||||
if (address_interface->size() == 0) {
|
||||
printf("\nNo addresses can be suspended.\n");
|
||||
} else if (address_interface->size() == 1) {
|
||||
|
|
|
@ -62,56 +62,52 @@ ACTOR Future<std::string> getSpecialKeysFailureErrorMessage(Reference<ITransacti
|
|||
return valueObj["message"].get_str();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> verifyAndAddInterface(std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface,
|
||||
Reference<FlowLock> connectLock,
|
||||
KeyValue kv) {
|
||||
wait(connectLock->take());
|
||||
state FlowLock::Releaser releaser(*connectLock);
|
||||
state ClientWorkerInterface workerInterf;
|
||||
try {
|
||||
// the interface is backward compatible; if parsing fails, the CLI version needs to be upgraded
|
||||
workerInterf = BinaryReader::fromStringRef<ClientWorkerInterface>(kv.value, IncludeVersion());
|
||||
} catch (Error& e) {
|
||||
fprintf(stderr, "Error: %s; CLI version is too old, please update to use a newer version\n", e.what());
|
||||
return Void();
|
||||
}
|
||||
state ClientLeaderRegInterface leaderInterf(workerInterf.address());
|
||||
choose {
|
||||
when(Optional<LeaderInfo> rep =
|
||||
wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) {
|
||||
StringRef ip_port =
|
||||
(kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key)
|
||||
.removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/"));
|
||||
(*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf);
|
||||
|
||||
if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) {
|
||||
Key full_ip_port2 =
|
||||
StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString());
|
||||
StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls"))
|
||||
? full_ip_port2.removeSuffix(LiteralStringRef(":tls"))
|
||||
: full_ip_port2;
|
||||
(*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf);
|
||||
}
|
||||
void addInterfacesFromKVs(RangeResult& kvs,
|
||||
std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface) {
|
||||
for (const auto& kv : kvs) {
|
||||
ClientWorkerInterface workerInterf;
|
||||
try {
|
||||
// the interface is backward compatible; if parsing fails, the CLI version needs to be upgraded
|
||||
workerInterf = BinaryReader::fromStringRef<ClientWorkerInterface>(kv.value, IncludeVersion());
|
||||
} catch (Error& e) {
|
||||
fprintf(stderr, "Error: %s; CLI version is too old, please update to use a newer version\n", e.what());
|
||||
return;
|
||||
}
|
||||
ClientLeaderRegInterface leaderInterf(workerInterf.address());
|
||||
StringRef ip_port =
|
||||
(kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key)
|
||||
.removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/"));
|
||||
(*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf);
|
||||
|
||||
if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) {
|
||||
Key full_ip_port2 =
|
||||
StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString());
|
||||
StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls"))
|
||||
? full_ip_port2.removeSuffix(LiteralStringRef(":tls"))
|
||||
: full_ip_port2;
|
||||
(*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf);
|
||||
}
|
||||
when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {}
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> getWorkerInterfaces(Reference<ITransaction> tr,
|
||||
std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface) {
|
||||
std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface,
|
||||
bool verify) {
|
||||
if (verify) {
|
||||
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
|
||||
tr->set(workerInterfacesVerifyOptionSpecialKey, ValueRef());
|
||||
}
|
||||
// Hold the reference to the standalone's memory
|
||||
state ThreadFuture<RangeResult> kvsFuture = tr->getRange(
|
||||
KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")),
|
||||
CLIENT_KNOBS->TOO_MANY);
|
||||
RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture));
|
||||
state RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture));
|
||||
ASSERT(!kvs.more);
|
||||
auto connectLock = makeReference<FlowLock>(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM);
|
||||
std::vector<Future<Void>> addInterfs;
|
||||
for (auto it : kvs) {
|
||||
addInterfs.push_back(verifyAndAddInterface(address_interface, connectLock, it));
|
||||
if (verify) {
|
||||
// remove the option if set
|
||||
tr->clear(workerInterfacesVerifyOptionSpecialKey);
|
||||
}
|
||||
wait(waitForAll(addInterfs));
|
||||
addInterfacesFromKVs(kvs, address_interface);
|
||||
return Void();
|
||||
}
|
||||
|
||||
|
|
|
@ -1050,36 +1050,6 @@ Future<T> stopNetworkAfter(Future<T> what) {
|
|||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> addInterface(std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface,
|
||||
Reference<FlowLock> connectLock,
|
||||
KeyValue kv) {
|
||||
wait(connectLock->take());
|
||||
state FlowLock::Releaser releaser(*connectLock);
|
||||
state ClientWorkerInterface workerInterf =
|
||||
BinaryReader::fromStringRef<ClientWorkerInterface>(kv.value, IncludeVersion());
|
||||
state ClientLeaderRegInterface leaderInterf(workerInterf.address());
|
||||
choose {
|
||||
when(Optional<LeaderInfo> rep =
|
||||
wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) {
|
||||
StringRef ip_port =
|
||||
(kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key)
|
||||
.removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/"));
|
||||
(*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf);
|
||||
|
||||
if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) {
|
||||
Key full_ip_port2 =
|
||||
StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString());
|
||||
StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls"))
|
||||
? full_ip_port2.removeSuffix(LiteralStringRef(":tls"))
|
||||
: full_ip_port2;
|
||||
(*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf);
|
||||
}
|
||||
}
|
||||
when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {}
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
|
||||
state LineNoise& linenoise = *plinenoise;
|
||||
state bool intrans = false;
|
||||
|
|
|
@ -120,6 +120,7 @@ extern const KeyRangeRef processClassSourceSpecialKeyRange;
|
|||
extern const KeyRangeRef processClassTypeSpecialKeyRange;
|
||||
// Other special keys
|
||||
inline const KeyRef errorMsgSpecialKey = LiteralStringRef("\xff\xff/error_message");
|
||||
inline const KeyRef workerInterfacesVerifyOptionSpecialKey = "\xff\xff/management/options/worker_interfaces/verify"_sr;
|
||||
// help functions (Copied from fdbcli.actor.cpp)
|
||||
|
||||
// get all workers' info
|
||||
|
@ -132,13 +133,14 @@ void printUsage(StringRef command);
|
|||
// Pre: tr failed with special_keys_api_failure error
|
||||
// Read the error message special key and return the message
|
||||
ACTOR Future<std::string> getSpecialKeysFailureErrorMessage(Reference<ITransaction> tr);
|
||||
// Using \xff\xff/worker_interfaces/ special key, get all worker interfaces
|
||||
// Using \xff\xff/worker_interfaces/ special key, get all worker interfaces.
|
||||
// A worker list will be returned from CC.
|
||||
// If verify, we will try to establish connections to all workers returned.
|
||||
// In particular, it will deserialize \xff\xff/worker_interfaces/<address>:=<ClientInterface> kv pairs and issue RPC
|
||||
// calls, then only return the interfaces (kv pairs) the client can talk to
|
||||
ACTOR Future<Void> getWorkerInterfaces(Reference<ITransaction> tr,
|
||||
std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface);
|
||||
// Deserialize \xff\xff/worker_interfaces/<address>:=<ClientInterface> k-v pair and verify by a RPC call
|
||||
ACTOR Future<Void> verifyAndAddInterface(std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface,
|
||||
Reference<FlowLock> connectLock,
|
||||
KeyValue kv);
|
||||
std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface,
|
||||
bool verify = false);
|
||||
// print cluster status info
|
||||
void printStatus(StatusObjectReader statusObj,
|
||||
StatusClient::StatusLevel level,
|
||||
|
|
|

@ -40,6 +40,7 @@

#include <cstring>
#include <fstream> // for perf microbenchmark
#include <limits>
#include <vector>

#define BG_READ_DEBUG false

@ -209,16 +210,21 @@ namespace {
BlobGranuleFileEncryptionKeys getEncryptBlobCipherKey(const BlobGranuleCipherKeysCtx cipherKeysCtx) {
    BlobGranuleFileEncryptionKeys eKeys;

    // A reconstructed cipher key is 'never' inserted into the BlobCipherKey cache, so choose 'neverExpire'
    eKeys.textCipherKey = makeReference<BlobCipherKey>(cipherKeysCtx.textCipherKey.encryptDomainId,
                                                       cipherKeysCtx.textCipherKey.baseCipherId,
                                                       cipherKeysCtx.textCipherKey.baseCipher.begin(),
                                                       cipherKeysCtx.textCipherKey.baseCipher.size(),
                                                       cipherKeysCtx.textCipherKey.salt,
                                                       std::numeric_limits<int64_t>::max(),
                                                       std::numeric_limits<int64_t>::max());
    eKeys.headerCipherKey = makeReference<BlobCipherKey>(cipherKeysCtx.headerCipherKey.encryptDomainId,
                                                         cipherKeysCtx.headerCipherKey.baseCipherId,
                                                         cipherKeysCtx.headerCipherKey.baseCipher.begin(),
                                                         cipherKeysCtx.headerCipherKey.baseCipher.size(),
                                                         cipherKeysCtx.headerCipherKey.salt,
                                                         std::numeric_limits<int64_t>::max(),
                                                         std::numeric_limits<int64_t>::max());

    return eKeys;
}
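
// The two std::numeric_limits<int64_t>::max() arguments passed above are, judging from the
// surrounding comment, the key's refresh-at and expire-at times; the maximum int64 value is what
// makes the reconstructed key effectively never refresh or expire.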

@ -810,10 +816,6 @@ static Standalone<VectorRef<ParsedDeltaBoundaryRef>> loadSnapshotFile(

    ASSERT(file.indexBlockRef.block.children.size() >= 2);

    // TODO: refactor this out of delta tree
    // int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first,
    // index.dataBlockOffsets.back().first);

    // find range of blocks needed to read
    ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin);

@ -1163,10 +1165,6 @@ Standalone<VectorRef<ParsedDeltaBoundaryRef>> loadChunkedDeltaFile(const Standal

    ASSERT(file.indexBlockRef.block.children.size() >= 2);

    // TODO: refactor this out of delta tree
    // int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first,
    // index.dataBlockOffsets.back().first);

    // find range of blocks needed to read
    ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin);

@ -1175,7 +1173,8 @@ Standalone<VectorRef<ParsedDeltaBoundaryRef>> loadChunkedDeltaFile(const Standal
        return deltas;
    }

    // FIXME: shared prefix for key comparison
    // FIXME: could cpu optimize first block a bit more by seeking right to start
    bool lastBlock = false;
    bool prevClearAfter = false;
    while (!lastBlock) {

@ -2378,7 +2377,6 @@ void checkDeltaRead(const KeyValueGen& kvGen,
    std::string filename = randomBGFilename(
        deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta");
    Standalone<BlobGranuleChunkRef> chunk;
    // TODO need to add cipher keys meta
    chunk.deltaFiles.emplace_back_deep(
        chunk.arena(), filename, 0, serialized->size(), serialized->size(), kvGen.cipherKeys);
    chunk.keyRange = kvGen.allRange;

@ -2435,7 +2433,6 @@ static std::tuple<KeyRange, Version, Version> randomizeKeyAndVersions(const KeyV
        }
    }

    // TODO randomize begin and read version to sometimes +/- 1 and readRange begin and end to keyAfter sometimes
    return { readRange, beginVersion, readVersion };
}

@ -31,13 +31,6 @@
#include "fdbclient/FDBTypes.h"
#include "flow/actorcompiler.h" // This must be the last #include.

// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other
// sorted thing could work. And if it used arenas it'd probably be more efficient with allocations, since everything
// else is in 1 arena and discarded at the end.

// TODO could refactor the file reading code from here and the delta file function into another actor,
// then this part would also be testable? but meh

ACTOR Future<Standalone<StringRef>> readFile(Reference<BlobConnectionProvider> bstoreProvider, BlobFilePointerRef f) {
    try {
        state Arena arena;

@ -60,6 +60,7 @@ void ClientKnobs::initialize(Randomize randomize) {

    init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
    init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
    init( GRV_ERROR_RETRY_DELAY, 5.0 ); if( randomize && BUGGIFY ) GRV_ERROR_RETRY_DELAY = 0.01 + 5 * deterministicRandom()->random01();
    init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01();
    init( REPLY_BYTE_LIMIT, 80000 );
    init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01();

@ -663,69 +663,43 @@ ACTOR Future<Void> asyncDeserializeClusterInterface(Reference<AsyncVar<Value>> s
    }
}

namespace {

void tryInsertIntoSamples(OpenDatabaseRequest::Samples& samples,
                          const NetworkAddress& networkAddress,
                          const Key& traceLogGroup) {
    ++samples.count;
    if (samples.samples.size() < static_cast<size_t>(CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT)) {
        samples.samples.insert({ networkAddress, traceLogGroup });
    }
}

} // namespace

OpenDatabaseRequest ClientData::getRequest() {
    OpenDatabaseRequest req;

    // SOMEDAY: add a yield in this loop
    for (auto& ci : clientStatusInfoMap) {
        const auto& networkAddress = ci.first;
        const auto& traceLogGroup = ci.second.traceLogGroup;

        for (auto& issue : ci.second.issues) {
            tryInsertIntoSamples(req.issues[issue], networkAddress, traceLogGroup);
        }

        if (!ci.second.versions.size()) {
            tryInsertIntoSamples(req.supportedVersions[ClientVersionRef()], networkAddress, traceLogGroup);
            continue;
        }

        ++req.clientCount;
        StringRef maxProtocol;
        for (auto& it : ci.second.versions) {
            maxProtocol = std::max(maxProtocol, it.protocolVersion);
            tryInsertIntoSamples(req.supportedVersions[it], networkAddress, traceLogGroup);
        }
        tryInsertIntoSamples(req.maxProtocolSupported[maxProtocol], networkAddress, traceLogGroup);
    }

    return req;
}
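
// Sketch of the Samples bookkeeping above (illustrative only): count grows without bound while
// the example set is capped at CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT entries, so getRequest() stays
// cheap no matter how many clients report the same issue.
//
//   OpenDatabaseRequest::Samples s;
//   for (int i = 0; i < 1000; i++)
//       tryInsertIntoSamples(s, someAddress, "trace-group"_sr); // someAddress: any NetworkAddress
//   // now s.count == 1000 and s.samples.size() <= CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT
//   // (duplicate (address, group) pairs are deduplicated by the std::set)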

@ -257,13 +257,14 @@ ThreadFuture<Standalone<VectorRef<KeyRef>>> DLTransaction::getRangeSplitPoints(c
    });
}

ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> DLTransaction::getBlobGranuleRanges(const KeyRangeRef& keyRange,
                                                                                     int rangeLimit) {
    if (!api->transactionGetBlobGranuleRanges) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* f = api->transactionGetBlobGranuleRanges(
        tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit);
    return toThreadFuture<Standalone<VectorRef<KeyRangeRef>>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
        const FdbCApi::FDBKeyRange* keyRanges;
        int keyRangesLength;

@ -583,6 +584,71 @@ ThreadFuture<Void> DLDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey)
    return toThreadFuture<Void>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); });
}

ThreadFuture<bool> DLDatabase::blobbifyRange(const KeyRangeRef& keyRange) {
    if (!api->databaseBlobbifyRange) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* f = api->databaseBlobbifyRange(
        db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size());

    return toThreadFuture<bool>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
        bool ret = false;
        ASSERT(!api->futureGetBool(f, &ret));
        return ret;
    });
}

ThreadFuture<bool> DLDatabase::unblobbifyRange(const KeyRangeRef& keyRange) {
    if (!api->databaseUnblobbifyRange) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* f = api->databaseUnblobbifyRange(
        db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size());

    return toThreadFuture<bool>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
        bool ret = false;
        ASSERT(!api->futureGetBool(f, &ret));
        return ret;
    });
}

ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> DLDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange,
                                                                                  int rangeLimit) {
    if (!api->databaseListBlobbifiedRanges) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* f = api->databaseListBlobbifiedRanges(
        db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit);

    return toThreadFuture<Standalone<VectorRef<KeyRangeRef>>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
        const FdbCApi::FDBKeyRange* keyRanges;
        int keyRangesLength;
        FdbCApi::fdb_error_t error = api->futureGetKeyRangeArray(f, &keyRanges, &keyRangesLength);
        ASSERT(!error);
        // The memory for this is stored in the FDBFuture and is released when the future gets destroyed.
        return Standalone<VectorRef<KeyRangeRef>>(VectorRef<KeyRangeRef>((KeyRangeRef*)keyRanges, keyRangesLength),
                                                  Arena());
    });
}

ThreadFuture<Version> DLDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) {
    if (!api->databaseVerifyBlobRange) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* f = api->databaseVerifyBlobRange(
        db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), version);

    return toThreadFuture<Version>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
        Version version = invalidVersion;
        ASSERT(!api->futureGetInt64(f, &version));
        return version;
    });
}
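
// Usage sketch of the C API these wrappers bind (hypothetical caller; error handling elided).
// The begin/end keys "a" and "b" are placeholders:
//
//   FDBFuture* f = fdb_database_blobbify_range(db, (const uint8_t*)"a", 1, (const uint8_t*)"b", 1);
//   fdb_future_block_until_ready(f);
//   fdb_bool_t ok = 0;
//   if (!fdb_future_get_error(f) && !fdb_future_get_bool(f, &ok) && ok) {
//       // the range [a, b) is now marked for blobbification
//   }
//   fdb_future_destroy(f);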

// DLApi

// Loads the specified function from a dynamic library

@ -670,6 +736,13 @@ void DLApi::init() {
                       fdbCPath,
                       "fdb_database_wait_purge_granules_complete",
                       headerVersion >= 710);
    loadClientFunction(&api->databaseBlobbifyRange, lib, fdbCPath, "fdb_database_blobbify_range", headerVersion >= 720);
    loadClientFunction(
        &api->databaseUnblobbifyRange, lib, fdbCPath, "fdb_database_unblobbify_range", headerVersion >= 720);
    loadClientFunction(
        &api->databaseListBlobbifiedRanges, lib, fdbCPath, "fdb_database_list_blobbified_ranges", headerVersion >= 720);
    loadClientFunction(
        &api->databaseVerifyBlobRange, lib, fdbCPath, "fdb_database_verify_blob_range", headerVersion >= 720);

    loadClientFunction(
        &api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710);
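
// Note: all four blob-granule entry points above are gated on headerVersion >= 720. Against an
// older libfdb_c the pointers stay null, and the DLDatabase wrappers return
// unsupported_operation() instead of failing at load time (behavior implied by
// loadClientFunction's version gate and the null checks in the wrappers above).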

@ -744,6 +817,7 @@ void DLApi::init() {
                       fdbCPath,
                       headerVersion >= 620 ? "fdb_future_get_int64" : "fdb_future_get_version",
                       headerVersion >= 0);
    loadClientFunction(&api->futureGetBool, lib, fdbCPath, "fdb_future_get_bool", headerVersion >= 720);
    loadClientFunction(&api->futureGetUInt64, lib, fdbCPath, "fdb_future_get_uint64", headerVersion >= 700);
    loadClientFunction(&api->futureGetError, lib, fdbCPath, "fdb_future_get_error", headerVersion >= 0);
    loadClientFunction(&api->futureGetKey, lib, fdbCPath, "fdb_future_get_key", headerVersion >= 0);

@ -1079,9 +1153,10 @@ ThreadFuture<Standalone<VectorRef<KeyRef>>> MultiVersionTransaction::getRangeSpl
}

ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> MultiVersionTransaction::getBlobGranuleRanges(
    const KeyRangeRef& keyRange,
    int rangeLimit) {
    auto tr = getTransaction();
    auto f = tr.transaction ? tr.transaction->getBlobGranuleRanges(keyRange, rangeLimit)
                            : makeTimeout<Standalone<VectorRef<KeyRangeRef>>>();
    return abortableFuture(f, tr.onChange);
}

@ -1589,6 +1664,32 @@ ThreadFuture<Void> MultiVersionDatabase::waitPurgeGranulesComplete(const KeyRef&
    return abortableFuture(f, dbState->dbVar->get().onChange);
}

ThreadFuture<bool> MultiVersionDatabase::blobbifyRange(const KeyRangeRef& keyRange) {
    auto dbVar = dbState->dbVar->get();
    auto f = dbVar.value ? dbVar.value->blobbifyRange(keyRange) : ThreadFuture<bool>(Never());
    return abortableFuture(f, dbVar.onChange);
}

ThreadFuture<bool> MultiVersionDatabase::unblobbifyRange(const KeyRangeRef& keyRange) {
    auto dbVar = dbState->dbVar->get();
    auto f = dbVar.value ? dbVar.value->unblobbifyRange(keyRange) : ThreadFuture<bool>(Never());
    return abortableFuture(f, dbVar.onChange);
}

ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> MultiVersionDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange,
                                                                                            int rangeLimit) {
    auto dbVar = dbState->dbVar->get();
    auto f = dbVar.value ? dbVar.value->listBlobbifiedRanges(keyRange, rangeLimit)
                         : ThreadFuture<Standalone<VectorRef<KeyRangeRef>>>(Never());
    return abortableFuture(f, dbVar.onChange);
}

ThreadFuture<Version> MultiVersionDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) {
    auto dbVar = dbState->dbVar->get();
    auto f = dbVar.value ? dbVar.value->verifyBlobRange(keyRange, version) : ThreadFuture<Version>(Never());
    return abortableFuture(f, dbVar.onChange);
}

// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older

@ -1279,32 +1279,6 @@ void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module,
ACTOR Future<RangeResult> getWorkerInterfaces(Reference<IClusterConnectionRecord> clusterRecord);
ACTOR Future<Optional<Value>> getJSON(Database db);

struct WorkerInterfacesSpecialKeyImpl : SpecialKeyRangeReadImpl {
    Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
                                 KeyRangeRef kr,
                                 GetRangeLimits limitsHint) const override {
        if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) {
            Key prefix = Key(getKeyRange().begin);
            return map(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord()),
                       [prefix = prefix, kr = KeyRange(kr)](const RangeResult& in) {
                           RangeResult result;
                           for (const auto& [k_, v] : in) {
                               auto k = k_.withPrefix(prefix);
                               if (kr.contains(k))
                                   result.push_back_deep(result.arena(), KeyValueRef(k, v));
                           }

                           std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{});
                           return result;
                       });
        } else {
            return RangeResult();
        }
    }

    explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
};

struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl {
    Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
                                 KeyRangeRef kr,

@ -3535,8 +3509,8 @@ ACTOR Future<Key> getKey(Reference<TransactionState> trState,

ACTOR Future<Version> waitForCommittedVersion(Database cx, Version version, SpanContext spanContext) {
    state Span span("NAPI:waitForCommittedVersion"_loc, spanContext);
    loop {
        try {
            choose {
                when(wait(cx->onProxiesChanged())) {}
                when(GetReadVersionReply v = wait(basicLoadBalance(

@ -3562,10 +3536,16 @@ ACTOR Future<Version> waitForCommittedVersion(Database cx, Version version, Span
                    wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, cx->taskID));
                }
            }
        } catch (Error& e) {
            if (e.code() == error_code_batch_transaction_throttled ||
                e.code() == error_code_grv_proxy_memory_limit_exceeded) {
                // GRV Proxy returns an error
                wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY));
            } else {
                TraceEvent(SevError, "WaitForCommittedVersionError").error(e);
                throw;
            }
        }
    }
}

@ -6774,9 +6754,12 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
            }
        }
    } catch (Error& e) {
        if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled &&
            e.code() != error_code_grv_proxy_memory_limit_exceeded)
            TraceEvent(SevError, "GetConsistentReadVersionError").error(e);
        if ((e.code() == error_code_batch_transaction_throttled ||
             e.code() == error_code_grv_proxy_memory_limit_exceeded) &&
            !cx->apiVersionAtLeast(630)) {
            wait(delayJittered(5.0));
        } else {
            throw;

@ -7655,7 +7638,9 @@ Future<Standalone<VectorRef<KeyRef>>> Transaction::getRangeSplitPoints(KeyRange

// the blob granule requests are a bit funky because they piggyback off the existing transaction to read from the system
// keyspace
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRangesActor(Transaction* self,
                                                                           KeyRange keyRange,
                                                                           int rangeLimit) {
    // FIXME: use streaming range read
    state KeyRange currentRange = keyRange;
    state Standalone<VectorRef<KeyRangeRef>> results;

@ -7678,7 +7663,7 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRangesActor(Trans

        // basically krmGetRange, but enable it to not use tenant without RAW_ACCESS by doing manual getRange with
        // UseTenant::False
        GetRangeLimits limits(2 * rangeLimit + 2);
        limits.minRows = 2;
        RangeResult rawMapping = wait(getRange(self->trState,
                                               self->getReadVersion(),

@ -7700,6 +7685,9 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRangesActor(Trans
            if (blobGranuleMapping[i].value.size()) {
                results.push_back(results.arena(),
                                  KeyRangeRef(blobGranuleMapping[i].key, blobGranuleMapping[i + 1].key));
                if (results.size() == rangeLimit) {
                    return results;
                }
            }
        }
        results.arena().dependsOn(blobGranuleMapping.arena());

@ -7711,8 +7699,8 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRangesActor(Trans
    }
}

Future<Standalone<VectorRef<KeyRangeRef>>> Transaction::getBlobGranuleRanges(const KeyRange& range, int rangeLimit) {
    return ::getBlobGranuleRangesActor(this, range, rangeLimit);
}
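
// Usage sketch (hypothetical): the new rangeLimit argument lets callers page through granule
// boundaries instead of reading them all at once; continuing from the last returned end key is
// the caller's job.
//
//   state Transaction tr(db);
//   loop {
//       try {
//           Standalone<VectorRef<KeyRangeRef>> granules = wait(tr.getBlobGranuleRanges(range, 1000));
//           // if granules.size() == 1000, resume from granules.back().end for the next page
//           break;
//       } catch (Error& e) {
//           wait(tr.onError(e));
//       }
//   }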

// hack (for now) to get blob worker interface into load balance

@ -7818,7 +7806,6 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
            fmt::print("Key range [{0} - {1}) missing worker assignment!\n",
                       granuleStartKey.printable(),
                       granuleEndKey.printable());
            // TODO probably new exception type instead
        }
        throw blob_granule_transaction_too_old();
    }

@ -8024,6 +8011,71 @@ ACTOR Future<Version> setPerpetualStorageWiggle(Database cx, bool enable, LockAw
    return version;
}

ACTOR Future<Version> checkBlobSubrange(Database db, KeyRange keyRange, Optional<Version> version) {
    state Transaction tr(db);
    state Version readVersionOut = invalidVersion;
    loop {
        try {
            wait(success(tr.readBlobGranules(keyRange, 0, version, &readVersionOut)));
            return readVersionOut;
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx, KeyRange range, Optional<Version> version) {
    state Database db(cx);
    state Transaction tr(db);
    state Standalone<VectorRef<KeyRangeRef>> allRanges;
    state KeyRange curRegion = KeyRangeRef(range.begin, range.begin);
    state Version readVersionOut = invalidVersion;
    state int batchSize = CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2;
    loop {
        try {
            wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), 20 * batchSize)));
        } catch (Error& e) {
            wait(tr.onError(e));
        }

        if (allRanges.empty()) {
            if (curRegion.begin < range.end) {
                return invalidVersion;
            }
            return readVersionOut;
        }

        state std::vector<Future<Version>> checkParts;
        // Chunk up to smaller ranges than this limit. Must be smaller than BG_TOO_MANY_GRANULES to not hit the limit
        int batchCount = 0;
        for (auto& it : allRanges) {
            if (it.begin != curRegion.end) {
                return invalidVersion;
            }

            curRegion = KeyRangeRef(curRegion.begin, it.end);
            batchCount++;

            if (batchCount == batchSize) {
                checkParts.push_back(checkBlobSubrange(db, curRegion, version));
                batchCount = 0;
                curRegion = KeyRangeRef(curRegion.end, curRegion.end);
            }
        }
        if (!curRegion.empty()) {
            checkParts.push_back(checkBlobSubrange(db, curRegion, version));
        }

        wait(waitForAll(checkParts));
        readVersionOut = checkParts.back().get();
        curRegion = KeyRangeRef(curRegion.end, curRegion.end);
    }
}

Future<Version> DatabaseContext::verifyBlobRange(const KeyRange& range, Optional<Version> version) {
    return verifyBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, version);
}
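
// Usage sketch (hypothetical): verifyBlobRange resolves to a version at which every granule in
// the range was readable, or invalidVersion if some subrange is not contiguously blobbified:
//
//   Version v = wait(cx->verifyBlobRange(normalKeys, Optional<Version>()));
//   if (v == invalidVersion) {
//       // part of the keyspace is not (yet) backed by readable blob granules
//   }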

ACTOR Future<std::vector<std::pair<UID, StorageWiggleValue>>> readStorageWiggleValues(Database cx,
                                                                                      bool primary,
                                                                                      bool use_system_priority) {

@ -9733,6 +9785,7 @@ Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
    return makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(this)));
}

// BlobGranule API.
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
                                         KeyRange range,
                                         Version purgeVersion,

@ -9824,6 +9877,89 @@ Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
    return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}

ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx, KeyRange range, bool active) {
    state Database db(cx);
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);

    state Value value = active ? blobRangeActive : blobRangeInactive;

    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

            if (active) {
                state RangeResult results = wait(krmGetRanges(tr, blobRangeKeys.begin, range));
                ASSERT(results.size() >= 2);
                if (results[0].key == range.begin && results[1].key == range.end &&
                    results[0].value == blobRangeActive) {
                    return true;
                } else {
                    for (int i = 0; i < results.size(); i++) {
                        if (results[i].value == blobRangeActive) {
                            return false;
                        }
                    }
                }
            }

            tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString());
            // This is not coalescing because we want to keep each range logically separate.
            wait(krmSetRange(tr, blobRangeKeys.begin, range, value));
            wait(tr->commit());
            printf("Successfully updated blob range [%s - %s) to %s\n",
                   range.begin.printable().c_str(),
                   range.end.printable().c_str(),
                   value.printable().c_str());
            return true;
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}

Future<bool> DatabaseContext::blobbifyRange(KeyRange range) {
    return setBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, true);
}

Future<bool> DatabaseContext::unblobbifyRange(KeyRange range) {
    return setBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, false);
}
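
// Workflow sketch (hypothetical): blobbifyRange only records the intent in the system keyspace;
// a caller that needs readable granules typically chains it with verifyBlobRange:
//
//   bool set = wait(cx->blobbifyRange(myRange)); // false if the range was already (partially) set
//   if (set) {
//       Version readable = wait(cx->verifyBlobRange(myRange, Optional<Version>()));
//       // readable stays invalidVersion until blob workers have snapshotted the range
//   }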

ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Reference<DatabaseContext> cx,
                                                                           KeyRange range,
                                                                           int rangeLimit) {
    state Database db(cx);
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
    state Standalone<VectorRef<KeyRangeRef>> blobRanges;

    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);

            state RangeResult results = wait(krmGetRanges(tr, blobRangeKeys.begin, range, 2 * rangeLimit + 2));

            blobRanges.arena().dependsOn(results.arena());
            for (int i = 0; i < results.size() - 1; i++) {
                if (results[i].value == blobRangeActive) {
                    // range boundaries are the keys of the krm results, not their values
                    blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
                }
                if (blobRanges.size() == rangeLimit) {
                    return blobRanges;
                }
            }

            return blobRanges;
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}

Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range, int rowLimit) {
    return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rowLimit);
}
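
// Usage sketch (hypothetical): with the krmGetRanges row limit of 2 * rangeLimit + 2 above, a
// caller asking for at most N ranges gets back at most N [begin, end) pairs:
//
//   Standalone<VectorRef<KeyRangeRef>> ranges = wait(cx->listBlobbifiedRanges(normalKeys, 100));
//   for (auto& r : ranges)
//       printf("blobbified: [%s - %s)\n", r.begin.printable().c_str(), r.end.printable().c_str());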

int64_t getMaxKeySize(KeyRef const& key) {
    return getMaxWriteKeySize(key, true);
}

@ -1783,7 +1783,8 @@ Future<Standalone<VectorRef<KeyRef>>> ReadYourWritesTransaction::getRangeSplitPo
    return waitOrError(tr.getRangeSplitPoints(range, chunkSize), resetPromise.getFuture());
}

Future<Standalone<VectorRef<KeyRangeRef>>> ReadYourWritesTransaction::getBlobGranuleRanges(const KeyRange& range,
                                                                                           int rangeLimit) {
    if (checkUsedDuringCommit()) {
        return used_during_commit();
    }

@ -1794,7 +1795,7 @@ Future<Standalone<VectorRef<KeyRangeRef>>> ReadYourWritesTransaction::getBlobGra
    if (range.begin > maxKey || range.end > maxKey)
        return key_outside_legal_range();

    return waitOrError(tr.getBlobGranuleRanges(range, rangeLimit), resetPromise.getFuture());
}

Future<Standalone<VectorRef<BlobGranuleChunkRef>>> ReadYourWritesTransaction::readBlobGranules(

@ -89,7 +89,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
    init( MAX_CACHE_VERSIONS, 10e6 );
    init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 );
    init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01();
    // In some rare simulation tests, particularly with log_spill:=1 configured, the 10 second limit is exceeded, causing SevError trace events
    // and simulation test failure. Increasing the knob value to 15.0 in simulation is a workaround to avoid these failures.
    init( TLOG_MAX_CREATE_DURATION, 10.0 ); if (isSimulated) TLOG_MAX_CREATE_DURATION = 15.0;
    init( PEEK_LOGGING_AMOUNT, 5 );
    init( PEEK_LOGGING_DELAY, 5.0 );
    init( PEEK_RESET_INTERVAL, 300.0 ); if ( randomize && BUGGIFY ) PEEK_RESET_INTERVAL = 20.0;

@ -159,9 +161,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
    init( PRIORITY_TEAM_FAILED, 805 );
    init( PRIORITY_TEAM_0_LEFT, 809 );
    init( PRIORITY_SPLIT_SHARD, 950 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350;
    init( PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD, 960 ); if( randomize && BUGGIFY ) PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD = 360; // Set as the lowest priority

    // Data distribution
    init( SHARD_ENCODE_LOCATION_METADATA, false ); if( randomize && BUGGIFY ) SHARD_ENCODE_LOCATION_METADATA = true;
    init( ENABLE_DD_PHYSICAL_SHARD, false ); // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true; When true, optimization of data move between DCs is disabled
    init( MAX_PHYSICAL_SHARD_BYTES, 500000000 ); // 500 MB; for ENABLE_DD_PHYSICAL_SHARD; smaller leads to larger number of physicalShard per storage server
    init( PHYSICAL_SHARD_METRICS_DELAY, 300.0 ); // 300 seconds; for ENABLE_DD_PHYSICAL_SHARD
    init( ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME, 600.0 ); if( randomize && BUGGIFY ) ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME = 0.0; // 600 seconds; for ENABLE_DD_PHYSICAL_SHARD
    init( READ_REBALANCE_CPU_THRESHOLD, 15.0 );
    init( READ_REBALANCE_SRC_PARALLELISM, 20 );
    init( READ_REBALANCE_SHARD_TOPK, READ_REBALANCE_SRC_PARALLELISM * 2 );

@ -365,6 +372,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
    init( REPLACE_CONTENTS_BYTES, 1e5 );

    // KeyValueStoreRocksDB
    init( ROCKSDB_READ_RANGE_ROW_LIMIT, 65535 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ROW_LIMIT = deterministicRandom()->randomInt(2, 10);

    init( ROCKSDB_BACKGROUND_PARALLELISM, 4 );
    init( ROCKSDB_READ_PARALLELISM, 4 );
    // Use a smaller memtable in simulation to avoid OOMs.

@ -394,6 +403,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi

    init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true;
    init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 );
    init( ROCKSDB_METRICS_SAMPLE_INTERVAL, 0.0 );
    init( ROCKSDB_MAX_SUBCOMPACTIONS, 2 );
    init( ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT, 64000000000 ); // 64GB, Rocksdb option, Writes will slow down.
    init( ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT, 100000000000 ); // 100GB, Rocksdb option, Writes will stall.

@ -406,6 +416,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
    init( ROCKSDB_COMPACTION_READAHEAD_SIZE, 32768 ); // 32 KB, performs bigger reads when doing compaction.
    init( ROCKSDB_BLOCK_SIZE, 32768 ); // 32 KB, size of the block in rocksdb cache.
    init( ENABLE_SHARDED_ROCKSDB, false );
    init( ROCKSDB_WRITE_BUFFER_SIZE, 1 << 30 ); // 1G
    init( ROCKSDB_MAX_TOTAL_WAL_SIZE, 0 ); // RocksDB default.
    init( ROCKSDB_MAX_BACKGROUND_JOBS, 2 ); // RocksDB default.
    init( ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD, 21600 ); // 6h, RocksDB default.

    // Leader election
    bool longLeaderElection = randomize && BUGGIFY;

@ -708,7 +722,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
    init( FETCH_BLOCK_BYTES, 2e6 );
    init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6;
    init( FETCH_KEYS_PARALLELISM, 2 );
    init( FETCH_KEYS_PARALLELISM_FULL, 6 );
    init( FETCH_KEYS_LOWER_PRIORITY, 0 );
    init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 );
    init( BUGGIFY_BLOCK_BYTES, 10000 );

@ -133,7 +133,8 @@ std::unordered_map<std::string, KeyRange> SpecialKeySpace::actorLineageApiComman
std::set<std::string> SpecialKeySpace::options = { "excluded/force",
                                                   "failed/force",
                                                   "excluded_locality/force",
                                                   "failed_locality/force",
                                                   "worker_interfaces/verify" };

std::set<std::string> SpecialKeySpace::tracingOptions = { kTracingTransactionIdKey, kTracingTokenKey };

@ -2754,6 +2755,64 @@ Future<Optional<std::string>> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr
    return excludeLocalityCommitActor(ryw, true);
}

// Defined in ReadYourWrites.actor.cpp
ACTOR Future<RangeResult> getWorkerInterfaces(Reference<IClusterConnectionRecord> clusterRecord);
// Defined in NativeAPI.actor.cpp
ACTOR Future<bool> verifyInterfaceActor(Reference<FlowLock> connectLock, ClientWorkerInterface workerInterf);

ACTOR static Future<RangeResult> workerInterfacesImplGetRangeActor(ReadYourWritesTransaction* ryw,
                                                                   KeyRef prefix,
                                                                   KeyRangeRef kr) {
    if (!ryw->getDatabase().getPtr() || !ryw->getDatabase()->getConnectionRecord())
        return RangeResult();

    state RangeResult interfs = wait(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord()));
    // for options' special keys, the boolean flag indicates if it's a SET operation
    auto [verify, _] = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandOptionSpecialKey(
        "worker_interfaces", "verify")];
    state RangeResult result;
    if (verify) {
        // if the verify option is set, we try to talk to every worker and only return those we can talk to
        Reference<FlowLock> connectLock(new FlowLock(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM));
        state std::vector<Future<bool>> verifyInterfs;
        for (const auto& [k_, value] : interfs) {
            auto k = k_.withPrefix(prefix);
            if (kr.contains(k)) {
                ClientWorkerInterface workerInterf =
                    BinaryReader::fromStringRef<ClientWorkerInterface>(value, IncludeVersion());
                verifyInterfs.push_back(verifyInterfaceActor(connectLock, workerInterf));
            } else {
                verifyInterfs.push_back(false);
            }
        }
        wait(waitForAll(verifyInterfs));
        for (int index = 0; index < interfs.size(); index++) {
            if (verifyInterfs[index].get()) {
                // if we can establish a connection, add the kv pair into the result
                result.push_back_deep(result.arena(),
                                      KeyValueRef(interfs[index].key.withPrefix(prefix), interfs[index].value));
            }
        }
    } else {
        for (const auto& [k_, v] : interfs) {
            auto k = k_.withPrefix(prefix);
            if (kr.contains(k))
                result.push_back_deep(result.arena(), KeyValueRef(k, v));
        }
    }
    std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{});
    return result;
}

WorkerInterfacesSpecialKeyImpl::WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}

Future<RangeResult> WorkerInterfacesSpecialKeyImpl::getRange(ReadYourWritesTransaction* ryw,
                                                             KeyRangeRef kr,
                                                             GetRangeLimits limitsHint) const {
    return workerInterfacesImplGetRangeActor(ryw, getKeyRange().begin, kr);
}
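
// Usage sketch (hypothetical): a client opts into verification by setting the
// worker_interfaces/verify option key in the same transaction that reads the range. The end key
// "\xff\xff/worker_interfaces0" is the assumed prefix-range end:
//
//   state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
//   tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
//   tr->set(SpecialKeySpace::getManagementApiCommandOptionSpecialKey("worker_interfaces", "verify"),
//           ValueRef());
//   RangeResult reachable = wait(tr->getRange(
//       KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr),
//       CLIENT_KNOBS->TOO_MANY));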

ACTOR Future<Void> validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw,
                                               KeySelector begin,
                                               KeySelector end,

@ -1331,6 +1331,9 @@ int64_t decodeBlobManagerEpochValue(ValueRef const& value) {
}

// blob granule data
const KeyRef blobRangeActive = LiteralStringRef("1");
const KeyRef blobRangeInactive = LiteralStringRef("0");

const KeyRangeRef blobGranuleFileKeys(LiteralStringRef("\xff\x02/bgf/"), LiteralStringRef("\xff\x02/bgf0"));
const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), LiteralStringRef("\xff\x02/bgm0"));
const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0"));

@ -144,6 +144,32 @@ ThreadFuture<Void> ThreadSafeDatabase::waitPurgeGranulesComplete(const KeyRef& p
    return onMainThread([db, key]() -> Future<Void> { return db->waitPurgeGranulesComplete(key); });
}

ThreadFuture<bool> ThreadSafeDatabase::blobbifyRange(const KeyRangeRef& keyRange) {
    DatabaseContext* db = this->db;
    KeyRange range = keyRange;
    return onMainThread([=]() -> Future<bool> { return db->blobbifyRange(range); });
}

ThreadFuture<bool> ThreadSafeDatabase::unblobbifyRange(const KeyRangeRef& keyRange) {
    DatabaseContext* db = this->db;
    KeyRange range = keyRange;
    return onMainThread([=]() -> Future<bool> { return db->unblobbifyRange(range); });
}

ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> ThreadSafeDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange,
                                                                                          int rangeLimit) {
    DatabaseContext* db = this->db;
    KeyRange range = keyRange;
    return onMainThread(
        [=]() -> Future<Standalone<VectorRef<KeyRangeRef>>> { return db->listBlobbifiedRanges(range, rangeLimit); });
}

ThreadFuture<Version> ThreadSafeDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) {
    DatabaseContext* db = this->db;
    KeyRange range = keyRange;
    return onMainThread([=]() -> Future<Version> { return db->verifyBlobRange(range, version); });
}

ThreadSafeDatabase::ThreadSafeDatabase(ConnectionRecordType connectionRecordType,
                                       std::string connectionRecordString,
                                       int apiVersion) {

@ -359,13 +385,14 @@ ThreadFuture<Standalone<VectorRef<const char*>>> ThreadSafeTransaction::getAddre
}

ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> ThreadSafeTransaction::getBlobGranuleRanges(
    const KeyRangeRef& keyRange,
    int rangeLimit) {
    ISingleThreadTransaction* tr = this->tr;
    KeyRange r = keyRange;

    return onMainThread([=]() -> Future<Standalone<VectorRef<KeyRangeRef>>> {
        tr->checkDeferredError();
        return tr->getBlobGranuleRanges(r, rangeLimit);
    });
}

@ -35,7 +35,6 @@
#define BG_ENCRYPT_COMPRESS_DEBUG false

// file format of actual blob files
// FIXME: use VecSerStrategy::String serialization for this
struct GranuleSnapshot : VectorRef<KeyValueRef> {

    constexpr static FileIdentifier file_identifier = 1300395;

@ -50,6 +50,7 @@ struct BlobWorkerStats {
    int activeReadRequests;
    int granulesPendingSplitCheck;
    Version minimumCFVersion;
    Version cfVersionLag;
    int notAtLatestChangeFeeds;
    int64_t lastResidentMemory;
    int64_t estimatedMaxResidentMemory;

@ -82,13 +83,14 @@ struct BlobWorkerStats {
    flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
    compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc), numRangesAssigned(0),
    mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0),
    cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0),
    initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) {
    specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
    specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
    specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });
    specialCounter(cc, "GranulesPendingSplitCheck", [this]() { return this->granulesPendingSplitCheck; });
    specialCounter(cc, "MinimumChangeFeedVersion", [this]() { return this->minimumCFVersion; });
    specialCounter(cc, "CFVersionLag", [this]() { return this->cfVersionLag; });
    specialCounter(cc, "NotAtLatestChangeFeeds", [this]() { return this->notAtLatestChangeFeeds; });
    specialCounter(cc, "LastResidentMemory", [this]() { return this->lastResidentMemory; });
    specialCounter(cc, "EstimatedMaxResidentMemory", [this]() { return this->estimatedMaxResidentMemory; });

@ -30,7 +30,6 @@

struct BlobWorkerInterface {
    constexpr static FileIdentifier file_identifier = 8358753;
    // TODO: mimic what StorageServerInterface does with sequential endpoint IDs
    RequestStream<ReplyPromise<Void>> waitFailure;
    PublicRequestStream<struct BlobGranuleFileRequest> blobGranuleFileRequest;
    RequestStream<struct AssignBlobRangeRequest> assignBlobRangeRequest;

@ -57,6 +57,7 @@ public:
    double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is
                                     // mostly wrong (e.g. dumping the database after a test)
    double FUTURE_VERSION_RETRY_DELAY;
    double GRV_ERROR_RETRY_DELAY;
    double UNKNOWN_TENANT_RETRY_DELAY;
    int REPLY_BYTE_LIMIT;
    double DEFAULT_BACKOFF;

@ -98,32 +98,44 @@ struct ClusterControllerClientInterface {
    }
};

struct OpenDatabaseRequest {
    constexpr static FileIdentifier file_identifier = 2799502;
    // Sent by the native API to the cluster controller to open a database and track client
    // info changes. Returns immediately if the current client info id is different from
    // knownClientInfoID; otherwise returns when it next changes (or perhaps after a long interval)

    struct Samples {
        int count;

        // network address / trace log group
        std::set<std::pair<NetworkAddress, Key>> samples;

        Samples() : count(0), samples{} {}

        template <typename Ar>
        void serialize(Ar& ar) {
            serializer(ar, count, samples);
        }

        // Merges a set of Samples into *this
        Samples& operator+=(const Samples& other) {
            count += other.count;
            samples.insert(std::begin(other.samples), std::end(other.samples));

            return *this;
        }
    };

    int clientCount = 0;

    // Maps issue to Samples
    std::map<Key, Samples> issues;

    // Maps ClientVersionRef to Samples
    std::map<Standalone<ClientVersionRef>, Samples> supportedVersions;

    // Maps max protocol to Samples
    std::map<Key, Samples> maxProtocolSupported;

    UID knownClientInfoID;
    ReplyPromise<struct ClientDBInfo> reply;

@ -25,6 +25,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/Tracing.h"
#include "flow/BlobCipher.h"

// The versioned message has wire format : -1, version, messages
static const int32_t VERSION_HEADER = -1;

@ -79,7 +80,7 @@ struct MutationRef {
    CompareAndClear,
    Reserved_For_SpanContextMessage /* See fdbserver/SpanContextMessage.h */,
    Reserved_For_OTELSpanContextMessage,
    Encrypted, /* Represents an encrypted mutation and cannot be used directly before decrypting */
    MAX_ATOMIC_OP
};
// This is stored this way for serialization purposes.

@ -128,6 +129,64 @@ struct MutationRef {
        }
    }

    // An encrypted mutation has type Encrypted, the encryption header (which contains encryption metadata) as param1,
    // and the payload as param2. It can be serialized/deserialized as a normal mutation, but can only be used after
    // decryption via decrypt().
    bool isEncrypted() const { return type == Encrypted; }

    const BlobCipherEncryptHeader* encryptionHeader() const {
        ASSERT(isEncrypted());
        return reinterpret_cast<const BlobCipherEncryptHeader*>(param1.begin());
    }

    MutationRef encrypt(const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>& cipherKeys,
                        const EncryptCipherDomainId& domainId,
                        Arena& arena) const {
        ASSERT_NE(domainId, ENCRYPT_INVALID_DOMAIN_ID);
        auto textCipherItr = cipherKeys.find(domainId);
        auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID);
        ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid());
        ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid());
        uint8_t iv[AES_256_IV_LENGTH] = { 0 };
        deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH);
        BinaryWriter bw(AssumeVersion(ProtocolVersion::withEncryptionAtRest()));
        bw << *this;
        EncryptBlobCipherAes265Ctr cipher(textCipherItr->second,
                                          headerCipherItr->second,
                                          iv,
                                          AES_256_IV_LENGTH,
                                          ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
        BlobCipherEncryptHeader* header = new (arena) BlobCipherEncryptHeader;
        StringRef headerRef(reinterpret_cast<const uint8_t*>(header), sizeof(BlobCipherEncryptHeader));
        StringRef payload =
            cipher.encrypt(static_cast<const uint8_t*>(bw.getData()), bw.getLength(), header, arena)->toStringRef();
        return MutationRef(Encrypted, headerRef, payload);
    }

    MutationRef encryptMetadata(const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>& cipherKeys,
                                Arena& arena) const {
        return encrypt(cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, arena);
    }

    MutationRef decrypt(const std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>& cipherKeys,
                        Arena& arena,
                        StringRef* buf = nullptr) const {
        const BlobCipherEncryptHeader* header = encryptionHeader();
        auto textCipherItr = cipherKeys.find(header->cipherTextDetails);
        auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails);
        ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid());
        ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid());
        DecryptBlobCipherAes256Ctr cipher(textCipherItr->second, headerCipherItr->second, header->iv);
        StringRef plaintext = cipher.decrypt(param2.begin(), param2.size(), *header, arena)->toStringRef();
        if (buf != nullptr) {
            *buf = plaintext;
        }
        ArenaReader reader(arena, plaintext, AssumeVersion(ProtocolVersion::withEncryptionAtRest()));
        MutationRef mutation;
        reader >> mutation;
        return mutation;
    }
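
    // Round-trip sketch (illustrative; the cipher-key maps are assumed to come from the encrypt
    // key infrastructure, which this header does not show):
    //   Arena arena;
    //   MutationRef m(SetValue, "k"_sr, "v"_sr);
    //   MutationRef enc = m.encrypt(domainIdToCipherKey, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, arena);
    //   ASSERT(enc.isEncrypted());
    //   MutationRef dec = enc.decrypt(cipherDetailsToKey, arena); // equals m after decryption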

    // These masks define which mutation types have particular properties (they are used to implement
    // isSingleKeyMutation() etc)
    enum {

@ -378,12 +378,18 @@ public:
    Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
    Future<Void> popChangeFeedMutations(Key rangeID, Version version);

    // BlobGranule API.
    Future<Key> purgeBlobGranules(KeyRange keyRange,
                                  Version purgeVersion,
                                  Optional<TenantName> tenant,
                                  bool force = false);
    Future<Void> waitPurgeGranulesComplete(Key purgeKey);

    Future<bool> blobbifyRange(KeyRange range);
    Future<bool> unblobbifyRange(KeyRange range);
    Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(KeyRange range, int rangeLimit);
    Future<Version> verifyBlobRange(const KeyRange& range, Optional<Version> version);

    // private:
    explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,
                             Reference<AsyncVar<ClientDBInfo>> clientDBInfo,

@ -78,7 +78,8 @@ public:
    virtual ThreadFuture<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(const KeyRangeRef& range,
                                                                            int64_t chunkSize) = 0;

    virtual ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRangeRef& keyRange,
                                                                                  int rowLimit) = 0;

    virtual ThreadResult<RangeResult> readBlobGranules(const KeyRangeRef& keyRange,
                                                       Version beginVersion,

@ -172,6 +173,13 @@ public:
    virtual ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) = 0;
    virtual ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) = 0;

    virtual ThreadFuture<bool> blobbifyRange(const KeyRangeRef& keyRange) = 0;
    virtual ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) = 0;
    virtual ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
                                                                                  int rangeLimit) = 0;

    virtual ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) = 0;

    // Interface to manage shared state across multiple connections to the same Database
    virtual ThreadFuture<DatabaseSharedState*> createSharedState() = 0;
    virtual void setSharedState(DatabaseSharedState* p) = 0;

@ -55,7 +55,7 @@ public:
    Future<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(KeyRange const& range, int64_t chunkSize) override {
        throw client_invalid_operation();
    }
    Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(KeyRange const& range, int rowLimit) override {
        throw client_invalid_operation();
    }
    Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranules(KeyRange const& range,

@ -80,7 +80,7 @@ public:
    virtual Future<Standalone<VectorRef<const char*>>> getAddressesForKey(Key const& key) = 0;
    virtual Future<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(KeyRange const& range, int64_t chunkSize) = 0;
    virtual Future<int64_t> getEstimatedRangeSizeBytes(KeyRange const& keys) = 0;
    virtual Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(KeyRange const& range, int rangeLimit) = 0;
    virtual Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranules(KeyRange const& range,
                                                                                Version begin,
                                                                                Optional<Version> readVersion,

@ -171,6 +171,32 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
                                                  uint8_t const* purge_key_name,
                                                  int purge_key_name_length);

    FDBFuture* (*databaseBlobbifyRange)(FDBDatabase* db,
                                        uint8_t const* begin_key_name,
                                        int begin_key_name_length,
                                        uint8_t const* end_key_name,
                                        int end_key_name_length);

    FDBFuture* (*databaseUnblobbifyRange)(FDBDatabase* db,
                                          uint8_t const* begin_key_name,
                                          int begin_key_name_length,
                                          uint8_t const* end_key_name,
                                          int end_key_name_length);

    FDBFuture* (*databaseListBlobbifiedRanges)(FDBDatabase* db,
                                               uint8_t const* begin_key_name,
                                               int begin_key_name_length,
                                               uint8_t const* end_key_name,
                                               int end_key_name_length,
                                               int rangeLimit);

    FDBFuture* (*databaseVerifyBlobRange)(FDBDatabase* db,
                                          uint8_t const* begin_key_name,
                                          int begin_key_name_length,
                                          uint8_t const* end_key_name,
                                          int end_key_name_length,
                                          Optional<Version> version);

    // Tenant
    fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction);

@ -276,7 +302,8 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
                                                    uint8_t const* begin_key_name,
                                                    int begin_key_name_length,
                                                    uint8_t const* end_key_name,
                                                    int end_key_name_length,
                                                    int rangeLimit);

    FDBResult* (*transactionReadBlobGranules)(FDBTransaction* db,
                                              uint8_t const* begin_key_name,
@@ -376,7 +403,8 @@ public:
    ThreadFuture<int64_t> getEstimatedRangeSizeBytes(const KeyRangeRef& keys) override;
    ThreadFuture<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(const KeyRangeRef& range,
                                                                    int64_t chunkSize) override;
    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override;
    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRangeRef& keyRange,
                                                                          int rangeLimit) override;

    ThreadResult<RangeResult> readBlobGranules(const KeyRangeRef& keyRange,
                                               Version beginVersion,
@@ -476,6 +504,12 @@ public:
    ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
    ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;

    ThreadFuture<bool> blobbifyRange(const KeyRangeRef& keyRange) override;
    ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) override;
    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
                                                                          int rangeLimit) override;
    ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) override;

    ThreadFuture<DatabaseSharedState*> createSharedState() override;
    void setSharedState(DatabaseSharedState* p) override;
@@ -574,7 +608,8 @@ public:

    ThreadFuture<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(const KeyRangeRef& range,
                                                                    int64_t chunkSize) override;
    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override;
    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRangeRef& keyRange,
                                                                          int rangeLimit) override;

    ThreadResult<RangeResult> readBlobGranules(const KeyRangeRef& keyRange,
                                               Version beginVersion,
@@ -817,6 +852,12 @@ public:
    ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
    ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;

    ThreadFuture<bool> blobbifyRange(const KeyRangeRef& keyRange) override;
    ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) override;
    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
                                                                          int rangeLimit) override;
    ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) override;

    ThreadFuture<DatabaseSharedState*> createSharedState() override;
    void setSharedState(DatabaseSharedState* p) override;
@@ -415,7 +415,7 @@ public:
    // The returned list would still be in form of [keys.begin, splitPoint1, splitPoint2, ... , keys.end]
    Future<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize);

    Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRange& range);
    Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRange& range, int rangeLimit);
    Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranules(const KeyRange& range,
                                                                        Version begin,
                                                                        Optional<Version> readVersion,
@@ -121,7 +121,7 @@ public:
    Future<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(const KeyRange& range, int64_t chunkSize) override;
    Future<int64_t> getEstimatedRangeSizeBytes(const KeyRange& keys) override;

    Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRange& range) override;
    Future<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRange& range, int rangeLimit) override;
    Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranules(const KeyRange& range,
                                                                        Version begin,
                                                                        Optional<Version> readVersion,
@@ -156,9 +156,14 @@ public:
    int PRIORITY_TEAM_FAILED; // Priority when a server in the team is excluded as failed
    int PRIORITY_TEAM_0_LEFT;
    int PRIORITY_SPLIT_SHARD;
    int PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD; // Priority when a physical shard is oversize or anonymous

    // Data distribution
    bool SHARD_ENCODE_LOCATION_METADATA; // If true, location metadata will contain shard ID.
    bool ENABLE_DD_PHYSICAL_SHARD; // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true.
    int64_t MAX_PHYSICAL_SHARD_BYTES;
    double PHYSICAL_SHARD_METRICS_DELAY;
    double ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME;

    double READ_REBALANCE_CPU_THRESHOLD; // read rebalance only happens if the source servers' CPU > threshold
    int READ_REBALANCE_SRC_PARALLELISM; // the max number of times a server can become a source server within a certain interval
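The comment on ENABLE_DD_PHYSICAL_SHARD encodes a dependency between two knobs. A hypothetical startup check (not in the commit) that makes the documented constraint executable:

    void checkPhysicalShardKnobs(const ServerKnobs* knobs) {
        // EXPERIMENTAL: physical shards only work when shard IDs are encoded in
        // location metadata, per the comment on ENABLE_DD_PHYSICAL_SHARD above
        if (knobs->ENABLE_DD_PHYSICAL_SHARD) {
            ASSERT(knobs->SHARD_ENCODE_LOCATION_METADATA);
        }
    }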
@@ -298,6 +303,7 @@ public:
    int64_t REPLACE_CONTENTS_BYTES;

    // KeyValueStoreRocksDB
    int ROCKSDB_READ_RANGE_ROW_LIMIT;
    int ROCKSDB_BACKGROUND_PARALLELISM;
    int ROCKSDB_READ_PARALLELISM;
    int64_t ROCKSDB_MEMTABLE_BYTES;
@@ -324,6 +330,7 @@ public:
    std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY;
    bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable rocks perf context metrics. May cause performance overhead
    double ROCKSDB_PERFCONTEXT_SAMPLE_RATE;
    double ROCKSDB_METRICS_SAMPLE_INTERVAL;
    int ROCKSDB_MAX_SUBCOMPACTIONS;
    int64_t ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT;
    int64_t ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT;
@@ -333,6 +340,10 @@ public:
    int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
    int64_t ROCKSDB_BLOCK_SIZE;
    bool ENABLE_SHARDED_ROCKSDB;
    int64_t ROCKSDB_WRITE_BUFFER_SIZE;
    int64_t ROCKSDB_MAX_TOTAL_WAL_SIZE;
    int64_t ROCKSDB_MAX_BACKGROUND_JOBS;
    int64_t ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD;

    // Leader election
    int MAX_NOTIFICATIONS;
@@ -548,6 +548,15 @@ public:
    Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};

class WorkerInterfacesSpecialKeyImpl : public SpecialKeyRangeReadImpl {
public:
    explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr);

    Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
                                 KeyRangeRef kr,
                                 GetRangeLimits limitsHint) const override;
};

// If the underlying set of key-value pairs of a key space is not changing, then we expect repeating a read to give the
// same result. Additionally, we can generate the expected result of any read if that read is reading a subrange. This
// actor performs a read of an arbitrary subrange of [begin, end) and validates the results.
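WorkerInterfacesSpecialKeyImpl is read-only: it subclasses SpecialKeyRangeReadImpl and overrides only getRange. A hedged client-side sketch of reading the new range; the "\xff\xff/worker_interfaces/" prefix is an assumption based on FDB's special-key naming conventions and is not shown in this diff:

    ACTOR Future<Void> listWorkerInterfaceKeys(Database cx) {
        state ReadYourWritesTransaction tr(cx);
        state KeyRange kr = KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"),
                                        LiteralStringRef("\xff\xff/worker_interfaces0"));
        RangeResult res = wait(tr.getRange(kr, CLIENT_KNOBS->TOO_MANY));
        for (auto& kv : res) {
            printf("%s\n", kv.key.printable().c_str());
        }
        return Void();
    }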
@@ -594,6 +594,8 @@ const Value blobManagerEpochValueFor(int64_t epoch);
int64_t decodeBlobManagerEpochValue(ValueRef const& value);

// blob granule keys
extern const StringRef blobRangeActive;
extern const StringRef blobRangeInactive;

extern const uint8_t BG_FILE_TYPE_DELTA;
extern const uint8_t BG_FILE_TYPE_SNAPSHOT;
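The values behind these externs are not shown in this diff, but the call sites updated later in this commit (which previously compared against LiteralStringRef("1") and an empty StringRef) suggest definitions along these lines; this is an inference, not quoted code:

    const StringRef blobRangeActive = LiteralStringRef("1");
    const StringRef blobRangeInactive = StringRef();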
@@ -62,6 +62,13 @@ public:
    ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
    ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;

    ThreadFuture<bool> blobbifyRange(const KeyRangeRef& keyRange) override;
    ThreadFuture<bool> unblobbifyRange(const KeyRangeRef& keyRange) override;
    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRanges(const KeyRangeRef& keyRange,
                                                                          int rangeLimit) override;

    ThreadFuture<Version> verifyBlobRange(const KeyRangeRef& keyRange, Optional<Version> version) override;

    ThreadFuture<DatabaseSharedState*> createSharedState() override;
    void setSharedState(DatabaseSharedState* p) override;
@@ -149,7 +156,8 @@ public:
    ThreadFuture<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(const KeyRangeRef& range,
                                                                    int64_t chunkSize) override;

    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override;
    ThreadFuture<Standalone<VectorRef<KeyRangeRef>>> getBlobGranuleRanges(const KeyRangeRef& keyRange,
                                                                          int rangeLimit) override;

    ThreadResult<RangeResult> readBlobGranules(const KeyRangeRef& keyRange,
                                               Version beginVersion,
@@ -25,7 +25,6 @@
#include "fdbclient/Notified.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/EncryptionOpsUtils.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/LogProtocolMessage.h"
@@ -169,7 +168,7 @@ private:
        } else {
            ASSERT(cipherKeys != nullptr);
            Arena arena;
            toCommit->writeTypedMessage(EncryptedMutationMessage::encryptMetadata(arena, *cipherKeys, m));
            toCommit->writeTypedMessage(m.encryptMetadata(*cipherKeys, arena));
        }
    }
@@ -25,7 +25,6 @@
#include "fdbclient/SystemData.h"
#include "fdbserver/BackupInterface.h"
#include "fdbserver/BackupProgress.actor.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/GetEncryptCipherKeys.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/LogProtocolMessage.h"
@@ -75,24 +74,25 @@ struct VersionedMessage {
            CODE_PROBE(true, "Returning false for OTELSpanContextMessage");
            return false;
        }
        if (EncryptedMutationMessage::isNextIn(reader)) {
        reader >> *m;
        if (m->isEncrypted()) {
            // In case the mutation is encrypted, get the decrypted mutation and also update message to point to
            // the decrypted mutation.
            // We use a dedicated arena for the decrypt buffer, as the other arena is used to count towards backup lock bytes.
            *m = EncryptedMutationMessage::decrypt(reader, decryptArena, cipherKeys, &message);
        } else {
            reader >> *m;
            *m = m->decrypt(cipherKeys, decryptArena, &message);
        }
        return normalKeys.contains(m->param1) || m->param1 == metadataVersionKey;
    }

    void collectCipherDetailIfEncrypted(std::unordered_set<BlobCipherDetails>& cipherDetails) {
        ArenaReader reader(arena, message, AssumeVersion(g_network->protocolVersion()));
        if (EncryptedMutationMessage::isNextIn(reader)) {
            EncryptedMutationMessage emm;
            reader >> emm;
            cipherDetails.insert(emm.header.cipherTextDetails);
            cipherDetails.insert(emm.header.cipherHeaderDetails);
        ASSERT(!message.empty());
        if (*message.begin() == MutationRef::Encrypted) {
            ArenaReader reader(arena, message, AssumeVersion(ProtocolVersion::withEncryptionAtRest()));
            MutationRef m;
            reader >> m;
            const BlobCipherEncryptHeader* header = m.encryptionHeader();
            cipherDetails.insert(header->cipherTextDetails);
            cipherDetails.insert(header->cipherHeaderDetails);
        }
    }
};
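The rewritten check drops the EncryptedMutationMessage wrapper: encryption is now detected from the mutation's own type byte. Restated as a standalone predicate (a sketch; the real code goes on to read the cipher details):

    bool isEncryptedMutation(const StringRef& message) {
        ASSERT(!message.empty());
        // an encrypted mutation is identified by its leading type byte rather
        // than by a separate wrapper message preceding it in the stream
        return *message.begin() == MutationRef::Encrypted;
    }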
@@ -453,20 +453,30 @@ struct BackupData {
    ACTOR static Future<Version> _getMinKnownCommittedVersion(BackupData* self) {
        state Span span("BA:GetMinCommittedVersion"_loc);
        loop {
            GetReadVersionRequest request(span.context,
                                          0,
                                          TransactionPriority::DEFAULT,
                                          invalidVersion,
                                          GetReadVersionRequest::FLAG_USE_MIN_KNOWN_COMMITTED_VERSION);
            choose {
                when(wait(self->cx->onProxiesChanged())) {}
                when(GetReadVersionReply reply =
                         wait(basicLoadBalance(self->cx->getGrvProxies(UseProvisionalProxies::False),
                                               &GrvProxyInterface::getConsistentReadVersion,
                                               request,
                                               self->cx->taskID))) {
                    self->cx->ssVersionVectorCache.applyDelta(reply.ssVersionVectorDelta);
                    return reply.version;
            try {
                GetReadVersionRequest request(span.context,
                                              0,
                                              TransactionPriority::DEFAULT,
                                              invalidVersion,
                                              GetReadVersionRequest::FLAG_USE_MIN_KNOWN_COMMITTED_VERSION);
                choose {
                    when(wait(self->cx->onProxiesChanged())) {}
                    when(GetReadVersionReply reply =
                             wait(basicLoadBalance(self->cx->getGrvProxies(UseProvisionalProxies::False),
                                                   &GrvProxyInterface::getConsistentReadVersion,
                                                   request,
                                                   self->cx->taskID))) {
                        self->cx->ssVersionVectorCache.applyDelta(reply.ssVersionVectorDelta);
                        return reply.version;
                    }
                }
            } catch (Error& e) {
                if (e.code() == error_code_batch_transaction_throttled ||
                    e.code() == error_code_grv_proxy_memory_limit_exceeded) {
                    // GRV Proxy returns an error
                    wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY));
                } else {
                    throw;
                }
            }
        }
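The try/catch added above turns two throttling errors into a jittered retry instead of failing the backup worker's GRV loop. The same shape in isolation, as a sketch (`attempt` stands in for the choose/when body above):

    ACTOR Future<Version> getVersionWithBackoff(std::function<Future<Version>()> attempt) {
        loop {
            try {
                Version v = wait(attempt());
                return v;
            } catch (Error& e) {
                if (e.code() == error_code_batch_transaction_throttled ||
                    e.code() == error_code_grv_proxy_memory_limit_exceeded) {
                    // transient overload: back off with jitter and retry
                    wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY));
                } else {
                    throw;
                }
            }
        }
    }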
@@ -189,7 +189,7 @@ ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range) {
    state int reClearInterval = 1; // do quadratic backoff on clear rate, b/c large keys can keep it not write-cold
    loop {
        try {
            Standalone<VectorRef<KeyRangeRef>> ranges = wait(tr.getBlobGranuleRanges(range));
            Standalone<VectorRef<KeyRangeRef>> ranges = wait(tr.getBlobGranuleRanges(range, 2));
            if (ranges.size() == 1) {
                return Void();
            }
@@ -130,7 +130,7 @@ void updateClientBlobRanges(KeyRangeMap<bool>* knownBlobRanges,
            }
            break;
        }
        bool active = dbBlobRanges[i].value == LiteralStringRef("1");
        bool active = dbBlobRanges[i].value == blobRangeActive;
        if (active) {
            if (BM_DEBUG) {
                fmt::print("BM sees client range [{0} - {1})\n",
@@ -547,6 +547,12 @@ ACTOR Future<BlobGranuleSplitPoints> alignKeys(Reference<BlobManagerData> bmData

    state Transaction tr = Transaction(bmData->db);
    state int idx = 1;
    state Reference<GranuleTenantData> tenantData = bmData->tenantData.getDataForGranule(granuleRange);
    while (SERVER_KNOBS->BG_METADATA_SOURCE == "tenant" && !tenantData.isValid()) {
        // this is a bit of a hack, but if we know this range is supposed to have a tenant, and it doesn't, just wait
        wait(delay(1.0));
        tenantData = bmData->tenantData.getDataForGranule(granuleRange);
    }
    for (; idx < splits.size() - 1; idx++) {
        loop {
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
@@ -559,7 +565,6 @@ ACTOR Future<BlobGranuleSplitPoints> alignKeys(Reference<BlobManagerData> bmData
                break;
            }

            Reference<GranuleTenantData> tenantData = bmData->tenantData.getDataForGranule(granuleRange);
            alignKeyBoundary(bmData, tenantData, nextKeyRes[0].key, offset, splitPoints);
            break;
        } catch (Error& e) {
@@ -682,7 +687,6 @@ ACTOR Future<BlobGranuleSplitPoints> splitRange(Reference<BlobManagerData> bmDat

// Picks a worker with the fewest number of already assigned ranges.
// If there is a tie, picks one such worker at random.
// TODO: add desired per-blob-worker limit? don't assign ranges to each worker past that limit?
ACTOR Future<UID> pickWorkerForAssign(Reference<BlobManagerData> bmData,
                                      Optional<std::pair<UID, Error>> previousFailure) {
    // wait until there are BWs to pick from
@@ -1017,6 +1021,8 @@ static bool handleRangeIsAssign(Reference<BlobManagerData> bmData, RangeAssignme
    }
    ASSERT(count == 1);

    bool forcePurging = bmData->isForcePurging(assignment.keyRange);

    if (assignment.worker.present() && assignment.worker.get().isValid()) {
        if (BM_DEBUG) {
            fmt::print("BW {0} already chosen for seqno {1} in BM {2}\n",
@@ -1034,8 +1040,10 @@ static bool handleRangeIsAssign(Reference<BlobManagerData> bmData, RangeAssignme
            // assignsInProgress
            bmData->addActor.send(doRangeAssignment(bmData, assignment, workerId, bmData->epoch, seqNo));
        } else {
            bmData->assignsInProgress.insert(assignment.keyRange,
                                             doRangeAssignment(bmData, assignment, workerId, bmData->epoch, seqNo));
            if (!forcePurging) {
                bmData->assignsInProgress.insert(assignment.keyRange,
                                                 doRangeAssignment(bmData, assignment, workerId, bmData->epoch, seqNo));
            }
            if (bmData->workerStats.count(workerId)) {
                bmData->workerStats[workerId].numGranulesAssigned += 1;
            }
@@ -1044,8 +1052,10 @@ static bool handleRangeIsAssign(Reference<BlobManagerData> bmData, RangeAssignme
        // Ensure the key boundaries are updated before we pick a worker
        bmData->workerAssignments.insert(assignment.keyRange, UID());
        ASSERT(assignment.assign.get().type != AssignRequestType::Continue);
        bmData->assignsInProgress.insert(assignment.keyRange,
                                         doRangeAssignment(bmData, assignment, Optional<UID>(), bmData->epoch, seqNo));
        if (!forcePurging) {
            bmData->assignsInProgress.insert(
                assignment.keyRange, doRangeAssignment(bmData, assignment, Optional<UID>(), bmData->epoch, seqNo));
        }
    }
    return true;
}
@@ -1094,10 +1104,6 @@ static bool handleRangeIsRevoke(Reference<BlobManagerData> bmData, RangeAssignme
}

static bool handleRangeAssign(Reference<BlobManagerData> bmData, RangeAssignment assignment) {
    if ((assignment.isAssign || !assignment.revoke.get().dispose) && bmData->isForcePurging(assignment.keyRange)) {
        return false;
    }

    int64_t seqNo = bmData->seqNo;
    bmData->seqNo++;
@@ -1242,7 +1248,6 @@ ACTOR Future<Void> monitorClientRanges(Reference<BlobManagerData> bmData) {
            // read change key at this point along with data
            state Optional<Value> ckvBegin = wait(tr->get(blobRangeChangeKey));

            // TODO: why is there a separate arena?
            state Arena ar;
            state RangeResult results = wait(krmGetRanges(tr,
                                                          blobRangeKeys.begin,
@@ -1280,7 +1285,7 @@ ACTOR Future<Void> monitorClientRanges(Reference<BlobManagerData> bmData) {
                needToCoalesce = false;

                for (int i = 0; i < results.size() - 1; i++) {
                    bool active = results[i].value == LiteralStringRef("1");
                    bool active = results[i].value == blobRangeActive;
                    bmData->knownBlobRanges.insert(KeyRangeRef(results[i].key, results[i + 1].key), active);
                }
            }
@@ -1453,7 +1458,7 @@ ACTOR Future<Void> reevaluateInitialSplit(Reference<BlobManagerData> bmData,
    }

    if (BM_DEBUG) {
        fmt::print("Re-evaluated split ({0}:\n", newRanges.size());
        fmt::print("Re-evaluated split ({0}):\n", newRanges.size());
        for (auto& it : newRanges) {
            fmt::print("    {0}\n", it.printable());
        }
@@ -1466,7 +1471,7 @@ ACTOR Future<Void> reevaluateInitialSplit(Reference<BlobManagerData> bmData,
    ASSERT(finalSplit.keys.size() > 2);

    if (BM_DEBUG) {
        fmt::print("Aligned split ({0}:\n", finalSplit.keys.size());
        fmt::print("Aligned split ({0}):\n", finalSplit.keys.size());
        for (auto& it : finalSplit.keys) {
            fmt::print("    {0}{1}\n", it.printable(), finalSplit.boundaries.count(it) ? " *" : "");
        }
@@ -1486,6 +1491,24 @@ ACTOR Future<Void> reevaluateInitialSplit(Reference<BlobManagerData> bmData,
            // make sure we're still manager when this transaction gets committed
            wait(checkManagerLock(tr, bmData));

            ForcedPurgeState purgeState = wait(getForcePurgedState(&tr->getTransaction(), granuleRange));
            if (purgeState != ForcedPurgeState::NonePurged) {
                CODE_PROBE(true, "Initial Split Re-evaluate stopped because of force purge");
                TraceEvent("GranuleSplitReEvalCancelledForcePurge", bmData->id)
                    .detail("Epoch", bmData->epoch)
                    .detail("GranuleRange", granuleRange);

                // destroy already created change feed from worker so it doesn't leak
                wait(updateChangeFeed(&tr->getTransaction(),
                                      granuleIDToCFKey(granuleID),
                                      ChangeFeedStatus::CHANGE_FEED_DESTROY,
                                      granuleRange));

                wait(tr->commit());

                return Void();
            }

            // this adds a read conflict range, so if another granule concurrently commits a file, we will retry and see
            // that
            KeyRange range = blobGranuleFileKeyRangeFor(granuleID);
@@ -1633,6 +1656,10 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
                                   bool writeHot,
                                   int64_t originalEpoch,
                                   int64_t originalSeqno) {
    if (bmData->isForcePurging(granuleRange)) {
        // ignore
        return Void();
    }
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);

    // first get ranges to split
@@ -1952,7 +1979,10 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
// read mapping from db to handle any in flight granules or other issues
// Forces all granules in the specified key range to flush data to blob up to the specified version. This is required
// for executing a merge.
ACTOR Future<Void> forceGranuleFlush(Reference<BlobManagerData> bmData, KeyRange keyRange, Version version) {
ACTOR Future<bool> forceGranuleFlush(Reference<BlobManagerData> bmData,
                                     UID mergeGranuleID,
                                     KeyRange keyRange,
                                     Version version) {
    state Transaction tr(bmData->db);
    state KeyRange currentRange = keyRange;
@@ -1975,7 +2005,13 @@ ACTOR Future<Void> forceGranuleFlush(Reference<BlobManagerData> bmData, KeyRange
                    TraceEvent("GranuleFlushCancelledForcePurge", bmData->id)
                        .detail("Epoch", bmData->epoch)
                        .detail("KeyRange", keyRange);
                    return Void();

                    // destroy already created change feed from earlier so it doesn't leak
                    wait(updateChangeFeed(
                        &tr, granuleIDToCFKey(mergeGranuleID), ChangeFeedStatus::CHANGE_FEED_DESTROY, keyRange));

                    wait(tr.commit());
                    return false;
                }

                // TODO KNOB
@@ -2091,7 +2127,7 @@ ACTOR Future<Void> forceGranuleFlush(Reference<BlobManagerData> bmData, KeyRange
                   version);
    }

    return Void();
    return true;
}

// Persist the merge intent for this merge in the database. Once this transaction commits, the merge is in progress. It
@@ -2126,6 +2162,9 @@ ACTOR Future<std::pair<UID, Version>> persistMergeGranulesStart(Reference<BlobMa

                wait(tr->commit());

                bmData->activeGranuleMerges.insert(mergeRange, invalidVersion);
                bmData->activeGranuleMerges.coalesce(mergeRange.begin);

                // TODO better error?
                return std::pair(UID(), invalidVersion);
            }
@@ -2171,10 +2210,9 @@ ACTOR Future<std::pair<UID, Version>> persistMergeGranulesStart(Reference<BlobMa
    }
}

// FIXME: why not just make parentGranuleRanges vector of N+1 keys?
// Persists the merge being complete in the database by clearing the merge intent. Once this transaction commits, the
// merge is considered completed.
ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
ACTOR Future<bool> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
                                            UID mergeGranuleID,
                                            KeyRange mergeRange,
                                            Version mergeVersion,
@@ -2219,10 +2257,14 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
                                      ChangeFeedStatus::CHANGE_FEED_DESTROY,
                                      mergeRange));

                // TODO could also delete history entry here

                wait(tr->commit());

                return Void();
                // TODO: check this in split re-eval too once that is merged!!
                bmData->activeGranuleMerges.insert(mergeRange, invalidVersion);
                bmData->activeGranuleMerges.coalesce(mergeRange.begin);

                return false;
            }

            tr->clear(blobGranuleMergeKeyFor(mergeGranuleID));
@@ -2298,7 +2340,7 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
                           tr->getCommittedVersion());
            }
            CODE_PROBE(true, "Granule merge complete");
            return Void();
            return true;
        } catch (Error& e) {
            wait(tr->onError(e));
        }
@@ -2328,16 +2370,28 @@ ACTOR Future<Void> finishMergeGranules(Reference<BlobManagerData> bmData,
    }

    // force granules to persist state up to mergeVersion
    wait(forceGranuleFlush(bmData, mergeRange, mergeVersion));
    bool successFlush = wait(forceGranuleFlush(bmData, mergeGranuleID, mergeRange, mergeVersion));
    if (!successFlush) {
        bmData->activeGranuleMerges.insert(mergeRange, invalidVersion);
        bmData->activeGranuleMerges.coalesce(mergeRange.begin);
        --bmData->stats.activeMerges;
        return Void();
    }

    // update state and clear merge intent
    wait(persistMergeGranulesDone(bmData,
                                  mergeGranuleID,
                                  mergeRange,
                                  mergeVersion,
                                  parentGranuleIDs,
                                  parentGranuleRanges,
                                  parentGranuleStartVersions));
    bool successFinish = wait(persistMergeGranulesDone(bmData,
                                                       mergeGranuleID,
                                                       mergeRange,
                                                       mergeVersion,
                                                       parentGranuleIDs,
                                                       parentGranuleRanges,
                                                       parentGranuleStartVersions));
    if (!successFinish) {
        bmData->activeGranuleMerges.insert(mergeRange, invalidVersion);
        bmData->activeGranuleMerges.coalesce(mergeRange.begin);
        --bmData->stats.activeMerges;
        return Void();
    }

    int64_t seqnoForEval = bmData->seqNo;
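Both cancellation paths above repeat the same three lines of bookkeeping. A hypothetical consolidation (not in the commit) that the duplicated blocks suggest:

    static void cancelActiveMerge(Reference<BlobManagerData> bmData, const KeyRange& mergeRange) {
        // forget the merge and restore mergeability of the range
        bmData->activeGranuleMerges.insert(mergeRange, invalidVersion);
        bmData->activeGranuleMerges.coalesce(mergeRange.begin);
        --bmData->stats.activeMerges;
    }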
@@ -2387,6 +2441,7 @@ ACTOR Future<Void> doMerge(Reference<BlobManagerData> bmData,
        wait(persistMergeGranulesStart(bmData, mergeRange, ids, ranges, startVersions));
    if (persistMerge.second == invalidVersion) {
        // cancelled because of force purge

        return Void();
    }
    wait(finishMergeGranules(
@@ -2427,6 +2482,11 @@ static void attemptStartMerge(Reference<BlobManagerData> bmData,
        }
    }

    if (bmData->isForcePurging(mergeRange)) {
        // ignore
        return;
    }

    if (BM_DEBUG) {
        fmt::print("BM {0} Starting merge of [{1} - {2}) ({3})\n",
                   bmData->epoch,
@@ -2851,7 +2911,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
                    if (rep.blockedVersion < inProgressMergeVersion) {
                        CODE_PROBE(true, "merge blocking re-snapshot");
                        if (BM_DEBUG) {
                            fmt::print("DBG: BM {0} MERGE @ {1} blocking re-snapshot [{2} - {3}) @ {4}, "
                            fmt::print("BM {0} MERGE @ {1} blocking re-snapshot [{2} - {3}) @ {4}, "
                                       "continuing snapshot\n",
                                       bmData->epoch,
                                       inProgressMergeVersion,
@@ -3324,7 +3384,8 @@ ACTOR Future<Void> loadBlobGranuleMergeBoundaries(Reference<BlobManagerData> bmD

            // Add the mappings to our in memory key range map
            for (int i = 0; i < results.size(); i++) {
                bmData->mergeBoundaries[results[i].key] = decodeBlobGranuleMergeBoundaryValue(results[i].value);
                bmData->mergeBoundaries[results[i].key.removePrefix(blobGranuleMergeBoundaryKeys.begin)] =
                    decodeBlobGranuleMergeBoundaryValue(results[i].value);
            }

            if (!results.more) {
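The fix strips the on-disk key prefix before using the key in the in-memory map, so later lookups by bare boundary key can match. The shape of the bug and fix as a sketch (the container and value types are assumed from the call site above):

    // keys scanned from the database are prefix + boundary; the map is keyed by
    // the bare boundary, so insert kv.key.removePrefix(prefix), not kv.key
    template <class Map>
    void addMergeBoundary(Map& mergeBoundaries, const KeyValueRef& kv, const KeyRef& prefix) {
        mergeBoundaries[kv.key.removePrefix(prefix)] = decodeBlobGranuleMergeBoundaryValue(kv.value);
    }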
@@ -3597,6 +3658,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
        fmt::print("BM {0} final ranges:\n", bmData->epoch);
    }

    state int totalGranules = 0;
    state int explicitAssignments = 0;
    for (auto& range : workerAssignments.intersectingRanges(normalKeys)) {
        int64_t epoch = std::get<1>(range.value());
@@ -3605,6 +3667,8 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
            continue;
        }

        totalGranules++;

        UID workerId = std::get<0>(range.value());
        bmData->workerAssignments.insert(range.range(), workerId);
@@ -3647,7 +3711,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
    TraceEvent("BlobManagerRecovered", bmData->id)
        .detail("Epoch", bmData->epoch)
        .detail("Duration", now() - recoveryStartTime)
        .detail("Granules", bmData->workerAssignments.size()) // TODO this includes un-set ranges, so it is inaccurate
        .detail("Granules", totalGranules)
        .detail("Assigned", explicitAssignments)
        .detail("Revoked", outOfDateAssignments.size());
@@ -3904,14 +3968,14 @@ ACTOR Future<GranuleFiles> loadHistoryFiles(Reference<BlobManagerData> bmData, U
    }
}

ACTOR Future<bool> canDeleteFullGranule(Reference<BlobManagerData> self, UID granuleId) {
ACTOR Future<bool> canDeleteFullGranuleSplit(Reference<BlobManagerData> self, UID granuleId) {
    state Transaction tr(self->db);
    state KeyRange splitRange = blobGranuleSplitKeyRangeFor(granuleId);
    state KeyRange checkRange = splitRange;
    state bool retry = false;

    if (BM_PURGE_DEBUG) {
        fmt::print("BM {0} Fully delete granule check {1}\n", self->epoch, granuleId.toString());
        fmt::print("BM {0} Fully delete granule split check {1}\n", self->epoch, granuleId.toString());
    }

    loop {
@@ -3990,6 +4054,51 @@ ACTOR Future<bool> canDeleteFullGranule(Reference<BlobManagerData> self, UID gra
    return false;
}

ACTOR Future<Void> canDeleteFullGranuleMerge(Reference<BlobManagerData> self, Optional<UID> mergeChildId) {
    // if this granule is the parent of a merged granule, the merged granule needs to be re-snapshotted
    // before we can delete this one
    if (!mergeChildId.present()) {
        return Void();
    }
    CODE_PROBE(true, "checking canDeleteFullGranuleMerge");

    if (BM_PURGE_DEBUG) {
        fmt::print("BM {0} Fully delete granule merge check {1}\n", self->epoch, mergeChildId.get().toString());
    }

    state Transaction tr(self->db);
    state KeyRange granuleFileRange = blobGranuleFileKeyRangeFor(mergeChildId.get());
    // loop until the granule has snapshotted
    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            tr.setOption(FDBTransactionOptions::LOCK_AWARE);
            RangeResult files = wait(tr.getRange(granuleFileRange, 1));
            if (!files.empty()) {
                if (BM_PURGE_DEBUG) {
                    fmt::print("BM {0} Fully delete granule merge check {1} done\n",
                               self->epoch,
                               mergeChildId.get().toString());
                }
                return Void();
            }
            wait(delay(1.0));
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

ACTOR Future<bool> canDeleteFullGranule(Reference<BlobManagerData> self, UID granuleId, Optional<UID> mergeChildId) {
    state Future<bool> split = canDeleteFullGranuleSplit(self, granuleId);
    state Future<Void> merge = canDeleteFullGranuleMerge(self, mergeChildId);

    wait(success(split) && merge);
    bool canDeleteHistory = wait(split);
    return canDeleteHistory;
}

static Future<Void> deleteFile(Reference<BlobConnectionProvider> bstoreProvider, std::string filePath) {
    Reference<BackupContainerFileSystem> bstore = bstoreProvider->getForRead(filePath);
    return bstore->deleteFile(filePath);
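canDeleteFullGranule above combines a Future<bool> and a Future<Void> with success() so both can be awaited via operator&&, then re-waits the typed future (already ready) to recover its value. The idiom in isolation, as a sketch:

    ACTOR Future<bool> waitBothKeepFirst(Future<bool> a, Future<Void> b) {
        wait(success(a) && b); // success() erases a's type so the futures compose
        bool result = wait(a); // a is already ready; this just unwraps its value
        return result;
    }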
@@ -4022,6 +4131,7 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
                                      Key historyKey,
                                      Version purgeVersion,
                                      KeyRange granuleRange,
                                      Optional<UID> mergeChildID,
                                      bool force) {
    if (BM_PURGE_DEBUG) {
        fmt::print("BM {0} Fully deleting granule [{1} - {2}): {3} @ {4}{5}\n",
@@ -4042,7 +4152,7 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
    if (force) {
        canDeleteHistoryKey = true;
    } else {
        wait(store(canDeleteHistoryKey, canDeleteFullGranule(self, granuleId)));
        wait(store(canDeleteHistoryKey, canDeleteFullGranule(self, granuleId, mergeChildID)));
    }
    state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(self, granuleRange));
@@ -4069,9 +4179,9 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
                   self->epoch,
                   granuleId.toString(),
                   filesToDelete.size());
        for (auto filename : filesToDelete) {
            fmt::print(" - {}\n", filename.c_str());
        }
        /*for (auto filename : filesToDelete) {
            fmt::print(" - {}\n", filename.c_str());
        }*/
    }

    // delete the files before the corresponding metadata.
@@ -4203,9 +4313,9 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
                   self->epoch,
                   granuleId.toString(),
                   filesToDelete.size());
        for (auto filename : filesToDelete) {
            fmt::print(" - {0}\n", filename);
        }
        /*for (auto filename : filesToDelete) {
            fmt::print(" - {0}\n", filename);
        }*/
    }

    // TODO: the following comment relies on the assumption that BWs will not get requests to
@@ -4281,11 +4391,11 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
        .detail("PurgeVersion", purgeVersion)
        .detail("Force", force);

    // queue of <range, startVersion, endVersion> for BFS traversal of history
    state std::queue<std::tuple<KeyRange, Version, Version>> historyEntryQueue;
    // queue of <range, startVersion, endVersion, mergeChildID> for BFS traversal of history
    state std::queue<std::tuple<KeyRange, Version, Version, Optional<UID>>> historyEntryQueue;

    // stacks of <granuleId, historyKey> and <granuleId> to track which granules to delete
    state std::vector<std::tuple<UID, Key, KeyRange>> toFullyDelete;
    // stacks of <granuleId, historyKey> and <granuleId> (and mergeChildID) to track which granules to delete
    state std::vector<std::tuple<UID, Key, KeyRange, Optional<UID>>> toFullyDelete;
    state std::vector<std::pair<UID, KeyRange>> toPartiallyDelete;

    // track which granules we have already added to traversal
@@ -4323,6 +4433,25 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
        tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
    }

    // wait for all active splits and merges in the range to come to a stop, so no races with purging
    std::vector<Future<Void>> activeBoundaryEvals;
    auto boundaries = self->boundaryEvaluations.intersectingRanges(range);
    for (auto& it : boundaries) {
        auto& f = it.cvalue().inProgress;
        if (f.isValid() && !f.isReady() && !f.isError()) {
            activeBoundaryEvals.push_back(f);
        }
    }

    if (!activeBoundaryEvals.empty()) {
        wait(waitForAll(activeBoundaryEvals));
    }

    // some merges aren't counted in boundary evals, for merge/split race reasons
    while (self->isMergeActive(range)) {
        wait(delayJittered(1.0));
    }

    auto ranges = self->workerAssignments.intersectingRanges(range);
    state std::vector<KeyRange> activeRanges;
@@ -4383,7 +4512,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
                       history.get().version);
        }
        visited.insert({ activeRange.begin.toString(), history.get().version });
        historyEntryQueue.push({ activeRange, history.get().version, MAX_VERSION });
        historyEntryQueue.push({ activeRange, history.get().version, MAX_VERSION, {} });
    } else if (BM_PURGE_DEBUG) {
        fmt::print("BM {0} No history for range, ignoring\n", self->epoch);
    }
@@ -4406,7 +4535,8 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
        state KeyRange currRange;
        state Version startVersion;
        state Version endVersion;
        std::tie(currRange, startVersion, endVersion) = historyEntryQueue.front();
        state Optional<UID> mergeChildID;
        std::tie(currRange, startVersion, endVersion, mergeChildID) = historyEntryQueue.front();
        historyEntryQueue.pop();

        if (BM_PURGE_DEBUG) {
@@ -4462,7 +4592,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
                fmt::print(
                    "BM {0} Granule {1} will be FULLY deleted\n", self->epoch, currHistoryNode.granuleID.toString());
            }
            toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange });
            toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange, mergeChildID });
        } else if (startVersion < purgeVersion) {
            if (BM_PURGE_DEBUG) {
                fmt::print("BM {0} Granule {1} will be partially deleted\n",
@@ -4476,6 +4606,8 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
        if (BM_PURGE_DEBUG) {
            fmt::print("BM {0} Checking {1} parents\n", self->epoch, currHistoryNode.parentVersions.size());
        }
        Optional<UID> mergeChildID =
            currHistoryNode.parentVersions.size() > 1 ? currHistoryNode.granuleID : Optional<UID>();
        for (int i = 0; i < currHistoryNode.parentVersions.size(); i++) {
            // for (auto& parent : currHistoryNode.parentVersions.size()) {
            // if we already added this node to queue, skip it; otherwise, mark it as visited
@@ -4505,7 +4637,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range

            // the parent's end version is this node's startVersion,
            // since this node must have started where its parent finished
            historyEntryQueue.push({ parentRange, parentVersion, startVersion });
            historyEntryQueue.push({ parentRange, parentVersion, startVersion, mergeChildID });
        }
    }
@@ -4546,12 +4678,13 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
        state UID granuleId;
        Key historyKey;
        KeyRange keyRange;
        std::tie(granuleId, historyKey, keyRange) = toFullyDelete[i];
        Optional<UID> mergeChildId;
        std::tie(granuleId, historyKey, keyRange, mergeChildId) = toFullyDelete[i];
        // FIXME: consider batching into a single txn (need to take care of txn size limit)
        if (BM_PURGE_DEBUG) {
            fmt::print("BM {0}: About to fully delete granule {1}\n", self->epoch, granuleId.toString());
        }
        wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, keyRange, force));
        wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, keyRange, mergeChildId, force));
    }

    if (BM_PURGE_DEBUG) {
@@ -5013,9 +5146,6 @@ TEST_CASE("/blobmanager/updateranges") {
    VectorRef<KeyRangeRef> added;
    VectorRef<KeyRangeRef> removed;

    StringRef active = LiteralStringRef("1");
    StringRef inactive = StringRef();

    RangeResult dbDataEmpty;
    std::vector<std::pair<KeyRangeRef, bool>> kbrRanges;
@@ -5026,34 +5156,34 @@ TEST_CASE("/blobmanager/updateranges") {

    // db data setup
    RangeResult dbDataAB;
    dbDataAB.emplace_back(ar, keyA, active);
    dbDataAB.emplace_back(ar, keyB, inactive);
    dbDataAB.emplace_back(ar, keyA, blobRangeActive);
    dbDataAB.emplace_back(ar, keyB, blobRangeInactive);

    RangeResult dbDataAC;
    dbDataAC.emplace_back(ar, keyA, active);
    dbDataAC.emplace_back(ar, keyC, inactive);
    dbDataAC.emplace_back(ar, keyA, blobRangeActive);
    dbDataAC.emplace_back(ar, keyC, blobRangeInactive);

    RangeResult dbDataAD;
    dbDataAD.emplace_back(ar, keyA, active);
    dbDataAD.emplace_back(ar, keyD, inactive);
    dbDataAD.emplace_back(ar, keyA, blobRangeActive);
    dbDataAD.emplace_back(ar, keyD, blobRangeInactive);

    RangeResult dbDataBC;
    dbDataBC.emplace_back(ar, keyB, active);
    dbDataBC.emplace_back(ar, keyC, inactive);
    dbDataBC.emplace_back(ar, keyB, blobRangeActive);
    dbDataBC.emplace_back(ar, keyC, blobRangeInactive);

    RangeResult dbDataBD;
    dbDataBD.emplace_back(ar, keyB, active);
    dbDataBD.emplace_back(ar, keyD, inactive);
    dbDataBD.emplace_back(ar, keyB, blobRangeActive);
    dbDataBD.emplace_back(ar, keyD, blobRangeInactive);

    RangeResult dbDataCD;
    dbDataCD.emplace_back(ar, keyC, active);
    dbDataCD.emplace_back(ar, keyD, inactive);
    dbDataCD.emplace_back(ar, keyC, blobRangeActive);
    dbDataCD.emplace_back(ar, keyD, blobRangeInactive);

    RangeResult dbDataAB_CD;
    dbDataAB_CD.emplace_back(ar, keyA, active);
    dbDataAB_CD.emplace_back(ar, keyB, inactive);
    dbDataAB_CD.emplace_back(ar, keyC, active);
    dbDataAB_CD.emplace_back(ar, keyD, inactive);
    dbDataAB_CD.emplace_back(ar, keyA, blobRangeActive);
    dbDataAB_CD.emplace_back(ar, keyB, blobRangeInactive);
    dbDataAB_CD.emplace_back(ar, keyC, blobRangeActive);
    dbDataAB_CD.emplace_back(ar, keyD, blobRangeInactive);

    // key ranges setup
    KeyRangeRef rangeAB = KeyRangeRef(keyA, keyB);
@@ -84,7 +84,6 @@ struct GranuleStartState {
    Optional<GranuleHistory> history;
};

// FIXME: add global byte limit for pending and buffered deltas
struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
    KeyRange keyRange;
@@ -1032,7 +1031,6 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>

    loop {
        tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        // FIXME: proper tenant support in Blob Worker
        tr->setOption(FDBTransactionOptions::RAW_ACCESS);
        tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
        tr->setOption(FDBTransactionOptions::LOCK_AWARE);
@@ -1059,7 +1057,7 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
            DEBUG_KEY_RANGE("BlobWorkerFDBSnapshot", readVersion, metadata->keyRange, bwData->id);

            // initial snapshot is committed in fdb, we can pop the change feed up to this version
            inFlightPops->push_back(bwData->db->popChangeFeedMutations(cfKey, readVersion));
            inFlightPops->push_back(bwData->db->popChangeFeedMutations(cfKey, readVersion + 1));
            return snapshotWriter.get();
        } catch (Error& e) {
            if (e.code() == error_code_operation_cancelled) {
@@ -1373,6 +1371,42 @@ ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bw
    return reSnapshotIdx;
}

ACTOR Future<BlobFileIndex> reSnapshotNoCheck(Reference<BlobWorkerData> bwData,
                                              Reference<BlobConnectionProvider> bstore,
                                              Reference<GranuleMetadata> metadata,
                                              UID granuleID,
                                              Future<BlobFileIndex> lastDeltaBeforeSnapshot) {
    BlobFileIndex lastDeltaIdx = wait(lastDeltaBeforeSnapshot);
    state Version reSnapshotVersion = lastDeltaIdx.version;
    wait(delay(0, TaskPriority::BlobWorkerUpdateFDB));

    CODE_PROBE(true, "re-snapshotting without BM check because still on old change feed!");

    if (BW_DEBUG) {
        fmt::print("Granule [{0} - {1}) re-snapshotting @ {2} WITHOUT checking with BM, because it is still on old "
                   "change feed!\n",
                   metadata->keyRange.begin.printable(),
                   metadata->keyRange.end.printable(),
                   reSnapshotVersion);
    }

    TraceEvent(SevDebug, "BlobGranuleReSnapshotOldFeed", bwData->id)
        .detail("Granule", metadata->keyRange)
        .detail("Version", reSnapshotVersion);

    // wait for the file updater to make sure that the last delta file is in the metadata before re-snapshotting
    while (metadata->files.deltaFiles.empty() || metadata->files.deltaFiles.back().version < reSnapshotVersion) {
        wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
    }

    std::vector<GranuleFiles> toSnapshot;
    toSnapshot.push_back(metadata->files);
    BlobFileIndex reSnapshotIdx =
        wait(compactFromBlob(bwData, bstore, metadata, granuleID, toSnapshot, reSnapshotVersion));

    return reSnapshotIdx;
}

// wait indefinitely to tell manager to re-evaluate this split, until the granule is revoked
ACTOR Future<Void> reevaluateInitialSplit(Reference<BlobWorkerData> bwData,
                                          UID granuleID,
@@ -1536,11 +1570,10 @@ void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
    }
    // FIXME: for a write-hot shard, we could potentially batch these and only pop the largest one after
    // several have completed
    // FIXME: we actually want to pop at this version + 1 because pop is exclusive?
    // FIXME: since this is async, and worker could die, new blob worker that opens granule should probably
    // kick off an async pop at its previousDurableVersion after opening the granule to guarantee it is
    // eventually popped?
    Future<Void> popFuture = bwData->db->popChangeFeedMutations(cfKey, completedDeltaFile.version);
    Future<Void> popFuture = bwData->db->popChangeFeedMutations(cfKey, completedDeltaFile.version + 1);
    // Do pop asynchronously
    inFlightPops.push_back(popFuture);
}
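Both pop-version changes in this file apply the same rule: popChangeFeedMutations is exclusive of the version passed, so discarding everything durable through version v requires popping at v + 1. As a one-line helper (a sketch):

    // pop is exclusive: passing durableVersion + 1 discards mutations <= durableVersion
    Future<Void> popThrough(Database db, Key cfKey, Version durableVersion) {
        return db->popChangeFeedMutations(cfKey, durableVersion + 1);
    }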
@@ -1962,6 +1995,13 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
                                                   metadata->keyRange,
                                                   bwData->changeFeedStreamReplyBufferSize,
                                                   false);
                // in case the previous worker died before popping the latest version, start another pop
                if (startState.previousDurableVersion != invalidVersion) {
                    ASSERT(startState.previousDurableVersion >= startState.changeFeedStartVersion);
                    Future<Void> popFuture =
                        bwData->db->popChangeFeedMutations(cfKey, startState.previousDurableVersion + 1);
                    inFlightPops.push_back(popFuture);
                }
            }

            // Start actors BEFORE setting new change feed data to ensure the change feed data is properly
@@ -2408,8 +2448,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
                    // yet

                    // If we have enough delta files, try to re-snapshot
                    if (snapshotEligible && metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT &&
                        metadata->pendingDeltaVersion >= startState.changeFeedStartVersion) {
                    if (snapshotEligible && metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT) {
                        if (BW_DEBUG && !inFlightFiles.empty()) {
                            fmt::print("Granule [{0} - {1}) ready to re-snapshot at {2} after {3} > {4} bytes, "
                                       "waiting for "
@@ -2437,13 +2476,19 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
                            previousFuture = Future<BlobFileIndex>(metadata->files.deltaFiles.back());
                        }
                        int64_t versionsSinceLastSnapshot = metadata->pendingDeltaVersion - metadata->pendingSnapshotVersion;
                        Future<BlobFileIndex> inFlightBlobSnapshot = checkSplitAndReSnapshot(bwData,
                                                                                             bstore,
                                                                                             metadata,
                                                                                             startState.granuleID,
                                                                                             metadata->bytesInNewDeltaFiles,
                                                                                             previousFuture,
                                                                                             versionsSinceLastSnapshot);
                        Future<BlobFileIndex> inFlightBlobSnapshot;
                        if (metadata->pendingDeltaVersion >= startState.changeFeedStartVersion) {
                            inFlightBlobSnapshot = checkSplitAndReSnapshot(bwData,
                                                                           bstore,
                                                                           metadata,
                                                                           startState.granuleID,
                                                                           metadata->bytesInNewDeltaFiles,
                                                                           previousFuture,
                                                                           versionsSinceLastSnapshot);
                        } else {
                            inFlightBlobSnapshot =
                                reSnapshotNoCheck(bwData, bstore, metadata, startState.granuleID, previousFuture);
                        }
                        inFlightFiles.push_back(InFlightFile(inFlightBlobSnapshot, metadata->pendingDeltaVersion, 0, true));
                        pendingSnapshots++;
@@ -3412,6 +3457,16 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
                               req.readVersion);
                }
            }
            // if the feed was popped by another worker and this BW only got empty versions, it wouldn't itself
            // see that it got popped, but we can still reject the read. In theory this should never happen given
            // the other protections, but it's a useful and inexpensive sanity check
            Version emptyVersion = metadata->activeCFData.get()->popVersion - 1;
            if (req.readVersion > metadata->durableDeltaVersion.get() &&
                emptyVersion > metadata->bufferedDeltaVersion) {
                CODE_PROBE(true, "feed popped for read but granule updater didn't notice yet");
                // FIXME: could try to cancel the actor here somehow, but it should find out eventually
                throw wrong_shard_server();
            }
            rangeGranulePair.push_back(std::pair(metadata->keyRange, metadata->files));
        }
@@ -3494,8 +3549,6 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
                }
            }

            // FIXME: get cipher keys for delta files too!

            // new deltas (if version is larger than version of last delta file)
            // FIXME: do trivial key bounds here if key range is not fully contained in request key
            // range
@@ -3512,8 +3565,6 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl

                // prune mutations based on begin version, if possible
                ASSERT(metadata->durableDeltaVersion.get() == metadata->pendingDeltaVersion);
                // FIXME: I think we can remove this dependsOn since we are doing push_back_deep
                rep.arena.dependsOn(metadata->currentDeltas.arena());
                MutationsAndVersionRef* mutationIt = metadata->currentDeltas.begin();
                if (granuleBeginVersion > metadata->currentDeltas.back().version) {
                    CODE_PROBE(true, "beginVersion pruning all in-memory mutations");
@@ -4164,7 +4215,9 @@ ACTOR Future<Void> handleRangeAssign(Reference<BlobWorkerData> bwData,
        wait(waitForAll(toWait));

        if (shouldStart) {
            bwData->stats.numRangesAssigned++;
            if (!isSelfReassign) {
                bwData->stats.numRangesAssigned++;
            }
            auto m = bwData->granuleMetadata.rangeContaining(req.keyRange.begin);
            ASSERT(m.begin() == req.keyRange.begin && m.end() == req.keyRange.end);
            if (m.value().activeMetadata.isValid()) {
@@ -4253,6 +4306,7 @@ void handleBlobVersionRequest(Reference<BlobWorkerData> bwData, MinBlobVersionRe
    MinBlobVersionReply rep;
    rep.version = bwData->db->getMinimumChangeFeedVersion();
    bwData->stats.minimumCFVersion = rep.version;
    bwData->stats.cfVersionLag = std::max((Version)0, req.grv - rep.version);
    bwData->stats.notAtLatestChangeFeeds = bwData->db->notAtLatestChangeFeeds.size();
    req.reply.send(rep);
}
@@ -4724,7 +4778,6 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
                if (self->statusStreamInitialized) {
                    copy = self->currentManagerStatusStream.get();
                }
                // TODO: pick a reasonable byte limit instead of just piggy-backing
                req.reply.setByteLimit(SERVER_KNOBS->BLOBWORKERSTATUSSTREAM_LIMIT_BYTES);
                self->statusStreamInitialized = true;
@@ -34,7 +34,6 @@
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/ConflictSet.h"
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/EncryptionOpsUtils.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "fdbserver/GetEncryptCipherKeys.h"
@@ -1168,8 +1167,7 @@ void writeMutation(CommitBatchContext* self, int64_t tenantId, const MutationRef
        self->toCommit.writeTypedMessage(mutation);
    } else {
        Arena arena;
        self->toCommit.writeTypedMessage(
            EncryptedMutationMessage::encrypt(arena, self->cipherKeys, tenantId /*domainId*/, mutation));
        self->toCommit.writeTypedMessage(mutation.encrypt(self->cipherKeys, tenantId /*domainId*/, arena));
    }
}
@@ -29,6 +29,7 @@
#include "fdbrpc/sim_validation.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/DDSharedContext.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/Knobs.h"
@@ -89,7 +90,9 @@ std::pair<const DmReasonPriorityMapping*, const PriorityDmReasonMapping*> buildP
        { DataMovementReason::TEAM_1_LEFT, SERVER_KNOBS->PRIORITY_TEAM_1_LEFT },
        { DataMovementReason::TEAM_FAILED, SERVER_KNOBS->PRIORITY_TEAM_FAILED },
        { DataMovementReason::TEAM_0_LEFT, SERVER_KNOBS->PRIORITY_TEAM_0_LEFT },
        { DataMovementReason::SPLIT_SHARD, SERVER_KNOBS->PRIORITY_SPLIT_SHARD }
        { DataMovementReason::SPLIT_SHARD, SERVER_KNOBS->PRIORITY_SPLIT_SHARD },
        { DataMovementReason::ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD,
          SERVER_KNOBS->PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD }
    };

    static PriorityDmReasonMapping priorityReason;
@@ -516,7 +519,7 @@ ACTOR Future<Void> dataDistributionRelocator(struct DDQueue* self,
                                             Future<Void> prevCleanup,
                                             const DDEnabledState* ddEnabledState);

struct DDQueue {
struct DDQueue : public IDDRelocationQueue {
    struct DDDataMove {
        DDDataMove() = default;
        explicit DDDataMove(UID id) : id(id) {}
@@ -629,6 +632,7 @@ struct DDQueue {

    std::vector<TeamCollectionInterface> teamCollections;
    Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
    Reference<PhysicalShardCollection> physicalShardCollection;
    PromiseStream<Promise<int64_t>> getAverageShardBytes;

    FlowLock startMoveKeysParallelismLock;
@@ -727,6 +731,7 @@ struct DDQueue {
            Database cx,
            std::vector<TeamCollectionInterface> teamCollections,
            Reference<ShardsAffectedByTeamFailure> sABTF,
            Reference<PhysicalShardCollection> physicalShardCollection,
            PromiseStream<Promise<int64_t>> getAverageShardBytes,
            int teamSize,
            int singleRegionTeamSize,
@@ -734,8 +739,9 @@ struct DDQueue {
            FutureStream<RelocateShard> input,
            PromiseStream<GetMetricsRequest> getShardMetrics,
            PromiseStream<GetTopKMetricsRequest> getTopKMetrics)
      : distributorId(mid), lock(lock), cx(cx), txnProcessor(new DDTxnProcessor(cx)), teamCollections(teamCollections),
        shardsAffectedByTeamFailure(sABTF), getAverageShardBytes(getAverageShardBytes),
      : IDDRelocationQueue(), distributorId(mid), lock(lock), cx(cx), txnProcessor(new DDTxnProcessor(cx)),
        teamCollections(teamCollections), shardsAffectedByTeamFailure(sABTF),
        physicalShardCollection(physicalShardCollection), getAverageShardBytes(getAverageShardBytes),
        startMoveKeysParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
        finishMoveKeysParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
        cleanUpDataMoveParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
@@ -1224,7 +1230,11 @@ struct DDQueue {
            // TODO(psm): The shard id is determined by DD.
            rrs.dataMove.reset();
            if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
                rrs.dataMoveId = deterministicRandom()->randomUniqueID();
                if (SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
                    rrs.dataMoveId = UID();
                } else {
                    rrs.dataMoveId = deterministicRandom()->randomUniqueID();
                }
            } else {
                rrs.dataMoveId = anonymousShardId;
            }
@@ -1312,6 +1322,8 @@ struct DDQueue {
        };
        return recurring(f, SERVER_KNOBS->DD_QUEUE_COUNTER_REFRESH_INTERVAL);
    }

    int getUnhealthyRelocationCount() override { return unhealthyRelocations; }
};

ACTOR Future<Void> cancelDataMove(struct DDQueue* self, KeyRange range, const DDEnabledState* ddEnabledState) {
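IDDRelocationQueue itself is not part of this diff; judging from the single override above, it is presumably a small pure-virtual interface along these lines (an assumption, not the actual header):

    class IDDRelocationQueue {
    public:
        // number of relocations currently in flight for unhealthy reasons
        virtual int getUnhealthyRelocationCount() = 0;
        virtual ~IDDRelocationQueue() = default;
    };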
@@ -1375,6 +1387,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
    state std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> bestTeams;
    state double startTime = now();
    state std::vector<UID> destIds;
    state uint64_t debugID = deterministicRandom()->randomUInt64();

    try {
        if (now() - self->lastInterval < 1.0) {
@ -1413,12 +1426,20 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
|
|||
.detail("Range", kr);
|
||||
}
|
||||
}
|
||||
self->dataMoves.insert(rd.keys, DDQueue::DDDataMove(rd.dataMoveId));
|
||||
if (rd.isRestore() || !SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
|
||||
if (SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
|
||||
ASSERT(rd.dataMoveId.isValid());
|
||||
}
|
||||
self->dataMoves.insert(rd.keys, DDQueue::DDDataMove(rd.dataMoveId));
|
||||
}
|
||||
}
|
||||
|
||||
state StorageMetrics metrics =
|
||||
wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(rd.keys))));
|
||||
|
||||
state uint64_t physicalShardIDCandidate = UID().first();
|
||||
state bool forceToUseNewPhysicalShard = false;
|
||||
|
||||
ASSERT(rd.src.size());
|
||||
loop {
|
||||
destOverloadedCount = 0;
|
||||

@ -1479,6 +1500,20 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
req.src = rd.src;
req.completeSources = rd.completeSources;

if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 1) {
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard =
self->physicalShardCollection->tryGetAvailableRemoteTeamWith(
physicalShardIDCandidate, metrics, debugID);
if (remoteTeamWithPhysicalShard.present()) {
// A remoteTeam exists in the mapping that has the physicalShardIDCandidate;
// use that remoteTeam with the physicalShard as the bestTeam
req = GetTeamRequest(remoteTeamWithPhysicalShard.get().servers);
}
}

// bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any
// server that hosts the relocateData. This is possible, for example, in a fearless
// configuration when the remote DC is just brought up.

@ -1510,10 +1545,62 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
anyWithSource = true;
}

bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforces selection of a remote team paired with a
// primary team. Thus, it may select an almost-full remote team. In this case, we must
// re-select a remote team. We set foundTeams = false to avoid finishing team selection;
// then forceToUseNewPhysicalShard is set, which forces getTeam to be used to select
// a remote team
if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
double minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
foundTeams = false;
break;
}
}
}

if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
bestTeams.emplace_back(bestTeam.first.get(), true);
// Always set bestTeams[i].second = true to disable optimization in data move between DCs
// for the correctness of PhysicalShardCollection
// Currently, enabling the optimization will break the invariant of PhysicalShardCollection
// Invariant: once a physical shard is created with a specific set of SSes, this SS set will
// never get changed.
} else {
bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
}

// get physicalShardIDCandidate
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
}
tciIndex++;
}

// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforces selection of a remote team paired with a primary team
// Thus, tryGetAvailableRemoteTeamWith() may select an unhealthy remote team
// In this case, we must re-select a remote team
// We set foundTeams = false to avoid finishing team selection
// Then, forceToUseNewPhysicalShard is set, which forces getTeam to be used to select a remote team
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
if (!bestTeams[1].first->isHealthy()) {
foundTeams = false;
}
}

// once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves
// already
anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap);

@ -1534,6 +1621,11 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
.detail("AnyDestOverloaded", anyDestOverloaded)
.detail("NumOfTeamCollections", self->teamCollections.size())
.detail("Servers", destServersString(bestTeams));
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (rd.isRestore() && destOverloadedCount > 50) {
throw data_move_dest_team_not_found();
}
}
wait(delay(SERVER_KNOBS->DEST_OVERLOADED_DELAY, TaskPriority::DataDistributionLaunch));
} else {
CODE_PROBE(true, "did not find a healthy destination team on the first attempt");

@ -1550,10 +1642,42 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
}
wait(delay(SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch));
}
// When forceToUseNewPhysicalShard = false, we get a paired primary team and remote team
// However, this pairing may fail
// Any retry triggers the use of a new physicalShard, which enters the normal routine
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
forceToUseNewPhysicalShard = true;
}

// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
}

if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (!rd.isRestore()) {
// when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate
// thus, update the physicalShardIDCandidate to related data structures
ASSERT(physicalShardIDCandidate != UID().first());
rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
inFlightRange.value().dataMoveId = rd.dataMoveId;
auto f = self->dataMoves.intersectingRanges(rd.keys);
for (auto it = f.begin(); it != f.end(); ++it) {
KeyRangeRef kr(it->range().begin, it->range().end);
const UID mId = it->value().id;
if (mId.isValid() && mId != rd.dataMoveId) {
TraceEvent("DDRelocatorConflictingDataMoveAfterGetTeam", distributorId)
.detail("CurrentDataMoveID", rd.dataMoveId)
.detail("DataMoveID", mId)
.detail("Range", kr);
}
}
self->dataMoves.insert(rd.keys, DDQueue::DDDataMove(rd.dataMoveId));
}
ASSERT(rd.dataMoveId.first() != UID().first());
auto dataMoveRange = self->dataMoves.rangeContaining(rd.keys.begin);
ASSERT(dataMoveRange.value().id == rd.dataMoveId);
}

// set cancellable to false on inFlight's entry for this key range
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
ASSERT(inFlightRange.range() == rd.keys);

@ -1772,6 +1896,20 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
self->bytesWritten += metrics.bytes;
self->shardsAffectedByTeamFailure->finishMove(rd.keys);
relocationComplete.send(rd);

if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// update physical shard collection
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams;
for (int i = 0; i < bestTeams.size(); i++) {
auto serverIds = bestTeams[i].first->getServerIDs();
selectedTeams.push_back(ShardsAffectedByTeamFailure::Team(serverIds, i == 0));
}
// The update of PhysicalShardToTeams, PhysicalShardInstances, keyRangePhysicalShardIDMap should
// be atomic
self->physicalShardCollection->updatePhysicalShardCollection(
rd.keys, rd.isRestore(), selectedTeams, rd.dataMoveId.first(), metrics, debugID);
}

return Void();
} else {
throw error;

@ -2333,6 +2471,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
Reference<AsyncVar<bool>> processingWiggle,
std::vector<TeamCollectionInterface> teamCollections,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
Reference<PhysicalShardCollection> physicalShardCollection,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
FutureStream<Promise<int>> getUnhealthyRelocationCount,

@ -2345,6 +2484,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
cx,
teamCollections,
shardsAffectedByTeamFailure,
physicalShardCollection,
getAverageShardBytes,
teamSize,
singleRegionTeamSize,

@ -2489,7 +2629,9 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(ddQueueFutures))) {}
when(Promise<int> r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); }
when(Promise<int> r = waitNext(getUnhealthyRelocationCount)) {
r.send(self.getUnhealthyRelocationCount());
}
}
}
} catch (Error& e) {

@ -2502,6 +2644,8 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
}
}

ACTOR Future<Void> dataDistributionQueue(Reference<DDSharedContext> context, Database cx);

TEST_CASE("/DataDistribution/DDQueue/ServerCounterTrace") {
state double duration = 2.5 * SERVER_KNOBS->DD_QUEUE_COUNTER_REFRESH_INTERVAL;
state DDQueue self;

@ -21,6 +21,7 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/DDSharedContext.h"
#include "fdbserver/Knobs.h"
#include "fdbclient/DatabaseContext.h"
#include "flow/ActorCollection.h"

@ -68,10 +69,10 @@ ACTOR Future<Void> updateMaxShardSize(Reference<AsyncVar<int64_t>> dbSizeEstimat
}
}

struct DataDistributionTracker {
struct DataDistributionTracker : public IDDShardTracker {
Database cx;
UID distributorId;
KeyRangeMap<ShardTrackedData>& shards;
KeyRangeMap<ShardTrackedData>* shards;
ActorCollection sizeChanges;

int64_t systemSizeEstimate;

@ -83,6 +84,9 @@ struct DataDistributionTracker {
PromiseStream<RelocateShard> output;
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;

// PhysicalShard Tracker
Reference<PhysicalShardCollection> physicalShardCollection;

Promise<Void> readyToStart;
Reference<AsyncVar<bool>> anyZeroHealthyTeams;

@ -92,7 +96,7 @@ struct DataDistributionTracker {
// The reference to trackerCancelled must be extracted by actors,
// because by the time (trackerCancelled == true) this memory cannot
// be accessed
bool& trackerCancelled;
bool* trackerCancelled;

// This class extracts the trackerCancelled reference from a DataDistributionTracker object
// Because some actors spawned by the dataDistributionTracker outlive the DataDistributionTracker

@ -104,7 +108,7 @@ struct DataDistributionTracker {

public:
SafeAccessor(DataDistributionTracker* tracker)
: trackerCancelled(tracker->trackerCancelled), tracker(*tracker) {
: trackerCancelled(*tracker->trackerCancelled), tracker(*tracker) {
ASSERT(!trackerCancelled);
}

@ -122,24 +126,29 @@ struct DataDistributionTracker {
Promise<Void> const& readyToStart,
PromiseStream<RelocateShard> const& output,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
Reference<PhysicalShardCollection> physicalShardCollection,
Reference<AsyncVar<bool>> anyZeroHealthyTeams,
KeyRangeMap<ShardTrackedData>& shards,
bool& trackerCancelled)
: cx(cx), distributorId(distributorId), shards(shards), sizeChanges(false), systemSizeEstimate(0),
dbSizeEstimate(new AsyncVar<int64_t>()), maxShardSize(new AsyncVar<Optional<int64_t>>()), output(output),
shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), readyToStart(readyToStart),
KeyRangeMap<ShardTrackedData>* shards,
bool* trackerCancelled)
: IDDShardTracker(), cx(cx), distributorId(distributorId), shards(shards), sizeChanges(false),
systemSizeEstimate(0), dbSizeEstimate(new AsyncVar<int64_t>()), maxShardSize(new AsyncVar<Optional<int64_t>>()),
output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure),
physicalShardCollection(physicalShardCollection), readyToStart(readyToStart),
anyZeroHealthyTeams(anyZeroHealthyTeams), trackerCancelled(trackerCancelled) {}

~DataDistributionTracker() {
trackerCancelled = true;
~DataDistributionTracker() override {
*trackerCancelled = true;
// Cancel all actors so they aren't waiting on sizeChanged broken promise
sizeChanges.clear(false);
}

double getAverageShardBytes() override { return maxShardSize->get().get() / 2.0; }
};

void restartShardTrackers(DataDistributionTracker* self,
KeyRangeRef keys,
Optional<ShardMetrics> startingMetrics = Optional<ShardMetrics>());
Optional<ShardMetrics> startingMetrics = Optional<ShardMetrics>(),
bool whenDDInit = false);

// Gets the permitted size and IO bounds for a shard. A shard that starts at allKeys.begin
// (i.e. '') will have a permitted size of 0, since the database can contain no data.

@ -186,7 +195,8 @@ int64_t getMaxShardSize(double dbSizeEstimate) {

ACTOR Future<Void> trackShardMetrics(DataDistributionTracker::SafeAccessor self,
KeyRange keys,
Reference<AsyncVar<Optional<ShardMetrics>>> shardMetrics) {
Reference<AsyncVar<Optional<ShardMetrics>>> shardMetrics,
bool whenDDInit) {
state BandwidthStatus bandwidthStatus =
shardMetrics->get().present() ? getBandwidthStatus(shardMetrics->get().get().metrics) : BandwidthStatusNormal;
state double lastLowBandwidthStartTime =

@ -195,7 +205,7 @@ ACTOR Future<Void> trackShardMetrics(DataDistributionTracker::SafeAccessor self,
state ReadBandwidthStatus readBandwidthStatus = shardMetrics->get().present()
? getReadBandwidthStatus(shardMetrics->get().get().metrics)
: ReadBandwidthStatusNormal;

state bool initWithNewMetrics = whenDDInit;
wait(delay(0, TaskPriority::DataDistribution));

/*TraceEvent("TrackShardMetricsStarting")

@ -303,6 +313,23 @@ ACTOR Future<Void> trackShardMetrics(DataDistributionTracker::SafeAccessor self,
if (shardMetrics->get().present()) {
self()->dbSizeEstimate->set(self()->dbSizeEstimate->get() + metrics.first.get().bytes -
shardMetrics->get().get().metrics.bytes);
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// update physicalShard metrics and return whether the keys need to move out of
// the physicalShard
const MoveKeyRangeOutPhysicalShard needToMove =
self()->physicalShardCollection->trackPhysicalShard(
keys, metrics.first.get(), shardMetrics->get().get().metrics, initWithNewMetrics);
if (needToMove) {
// Do we need to update shardsAffectedByTeamFailure here?
self()->output.send(
RelocateShard(keys,
DataMovementReason::ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD,
RelocateReason::OTHER));
}
if (initWithNewMetrics) {
initWithNewMetrics = false;
}
}
if (keys.begin >= systemKeys.begin) {
self()->systemSizeEstimate +=
metrics.first.get().bytes - shardMetrics->get().get().metrics.bytes;

@ -399,7 +426,7 @@ ACTOR Future<int64_t> getFirstSize(Reference<AsyncVar<Optional<ShardMetrics>>> s
ACTOR Future<Void> changeSizes(DataDistributionTracker* self, KeyRange keys, int64_t oldShardsEndingSize) {
state std::vector<Future<int64_t>> sizes;
state std::vector<Future<int64_t>> systemSizes;
for (auto it : self->shards.intersectingRanges(keys)) {
for (auto it : self->shards->intersectingRanges(keys)) {
Future<int64_t> thisSize = getFirstSize(it->value().stats);
sizes.push_back(thisSize);
if (it->range().begin >= systemKeys.begin) {

@ -557,8 +584,8 @@ Future<Void> shardMerger(DataDistributionTracker* self,
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize) {
int64_t maxShardSize = self->maxShardSize->get().get();

auto prevIter = self->shards.rangeContaining(keys.begin);
auto nextIter = self->shards.rangeContaining(keys.begin);
auto prevIter = self->shards->rangeContaining(keys.begin);
auto nextIter = self->shards->rangeContaining(keys.begin);

CODE_PROBE(true, "shard to be merged");
ASSERT(keys.begin > allKeys.begin);

@ -778,8 +805,11 @@ ACTOR Future<Void> shardTracker(DataDistributionTracker::SafeAccessor self,
}
}

void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys, Optional<ShardMetrics> startingMetrics) {
auto ranges = self->shards.getAffectedRangesAfterInsertion(keys, ShardTrackedData());
void restartShardTrackers(DataDistributionTracker* self,
KeyRangeRef keys,
Optional<ShardMetrics> startingMetrics,
bool whenDDInit) {
auto ranges = self->shards->getAffectedRangesAfterInsertion(keys, ShardTrackedData());
for (int i = 0; i < ranges.size(); i++) {
if (!ranges[i].value.trackShard.isValid() && ranges[i].begin != keys.begin) {
// When starting, key space will be full of "dummy" default constructed entries.

@ -805,8 +835,9 @@ void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys, Optio
ShardTrackedData data;
data.stats = shardMetrics;
data.trackShard = shardTracker(DataDistributionTracker::SafeAccessor(self), ranges[i], shardMetrics);
data.trackBytes = trackShardMetrics(DataDistributionTracker::SafeAccessor(self), ranges[i], shardMetrics);
self->shards.insert(ranges[i], data);
data.trackBytes =
trackShardMetrics(DataDistributionTracker::SafeAccessor(self), ranges[i], shardMetrics, whenDDInit);
self->shards->insert(ranges[i], data);
}
}

@ -819,7 +850,8 @@ ACTOR Future<Void> trackInitialShards(DataDistributionTracker* self, Reference<I

state int s;
for (s = 0; s < initData->shards.size() - 1; s++) {
restartShardTrackers(self, KeyRangeRef(initData->shards[s].key, initData->shards[s + 1].key));
restartShardTrackers(
self, KeyRangeRef(initData->shards[s].key, initData->shards[s + 1].key), Optional<ShardMetrics>(), true);
wait(yield(TaskPriority::DataDistribution));
}

@ -848,7 +880,7 @@ ACTOR Future<Void> fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get
for (i = 0; i < SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT && i < req.keys.size(); ++i) {
auto range = req.keys[i];
StorageMetrics metrics;
for (auto t : self->shards.intersectingRanges(range)) {
for (auto t : self->shards->intersectingRanges(range)) {
auto& stats = t.value().stats;
if (!stats->get().present()) {
onChange = stats->onChange();

@ -914,7 +946,7 @@ ACTOR Future<Void> fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr
loop {
Future<Void> onChange;
StorageMetrics returnMetrics;
for (auto t : self->shards.intersectingRanges(req.keys)) {
for (auto t : self->shards->intersectingRanges(req.keys)) {
auto& stats = t.value().stats;
if (!stats->get().present()) {
onChange = stats->onChange();

@ -958,8 +990,8 @@ ACTOR Future<Void> fetchShardMetricsList_impl(DataDistributionTracker* self, Get
// list of metrics, regenerate on loop when full range unsuccessful
Standalone<VectorRef<DDMetricsRef>> result;
Future<Void> onChange;
auto beginIter = self->shards.containedRanges(req.keys).begin();
auto endIter = self->shards.intersectingRanges(req.keys).end();
auto beginIter = self->shards->containedRanges(req.keys).begin();
auto endIter = self->shards->intersectingRanges(req.keys).end();
for (auto t = beginIter; t != endIter; ++t) {
auto& stats = t.value().stats;
if (!stats->get().present()) {

@ -1000,6 +1032,7 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
Database cx,
PromiseStream<RelocateShard> output,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
Reference<PhysicalShardCollection> physicalShardCollection,
PromiseStream<GetMetricsRequest> getShardMetrics,
FutureStream<GetTopKMetricsRequest> getTopKMetrics,
PromiseStream<GetMetricsListRequest> getShardMetricsList,

@ -1014,9 +1047,10 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
readyToStart,
output,
shardsAffectedByTeamFailure,
physicalShardCollection,
anyZeroHealthyTeams,
*shards,
*trackerCancelled);
shards,
trackerCancelled);
state Future<Void> loggingTrigger = Void();
state Future<Void> readHotDetect = readHotDetector(&self);
state Reference<EventCacheHolder> ddTrackerStatsEventHolder = makeReference<EventCacheHolder>("DDTrackerStats");

@ -1025,12 +1059,10 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
initData = Reference<InitialDataDistribution>();

loop choose {
when(Promise<int64_t> req = waitNext(getAverageShardBytes)) {
req.send(self.maxShardSize->get().get() / 2);
}
when(Promise<int64_t> req = waitNext(getAverageShardBytes)) { req.send(self.getAverageShardBytes()); }
when(wait(loggingTrigger)) {
TraceEvent("DDTrackerStats", self.distributorId)
.detail("Shards", self.shards.size())
.detail("Shards", self.shards->size())
.detail("TotalSizeBytes", self.dbSizeEstimate->get())
.detail("SystemSizeBytes", self.systemSizeEstimate)
.trackLatest(ddTrackerStatsEventHolder->trackingKey);

@ -1056,3 +1088,542 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
throw e;
}
}

// Not used yet
ACTOR Future<Void> dataDistributionTracker(Reference<DDSharedContext> context,
Reference<InitialDataDistribution> initData,
Database cx,
KeyRangeMap<ShardTrackedData>* shards);

// Methods for PhysicalShardCollection
FDB_DEFINE_BOOLEAN_PARAM(InAnonymousPhysicalShard);
FDB_DEFINE_BOOLEAN_PARAM(PhysicalShardHasMoreThanKeyRange);
FDB_DEFINE_BOOLEAN_PARAM(InOverSizePhysicalShard);
FDB_DEFINE_BOOLEAN_PARAM(PhysicalShardAvailable);
FDB_DEFINE_BOOLEAN_PARAM(MoveKeyRangeOutPhysicalShard);
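
These macros give each boolean result its own type so call sites cannot mix parameters up. A loose standalone approximation of the pattern (not the actual macro expansion):

    // Loose approximation of a strongly typed boolean parameter; the real
    // FDB_DEFINE_BOOLEAN_PARAM macro expansion differs in detail.
    class PhysicalShardAvailableParam {
        bool value;
    public:
        constexpr explicit PhysicalShardAvailableParam(bool v) : value(v) {}
        constexpr explicit operator bool() const { return value; }
    };
    inline constexpr PhysicalShardAvailableParam kShardAvailable{ true };
    inline constexpr PhysicalShardAvailableParam kShardUnavailable{ false };

    void onAvailability(PhysicalShardAvailableParam available) {
        if (available) { /* proceed with the move-in */ }
        // onAvailability(true); // would not compile: the constructor is explicit
    }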

// Decide whether a physical shard is available at the moment this function is called
PhysicalShardAvailable PhysicalShardCollection::checkPhysicalShardAvailable(uint64_t physicalShardID,
StorageMetrics const& moveInMetrics) {
ASSERT(physicalShardID != UID().first() && physicalShardID != anonymousShardId.first());
ASSERT(physicalShardInstances.count(physicalShardID) > 0);
if (physicalShardInstances[physicalShardID].metrics.bytes + moveInMetrics.bytes >
SERVER_KNOBS->MAX_PHYSICAL_SHARD_BYTES) {
return PhysicalShardAvailable::False;
}
return PhysicalShardAvailable::True;
}
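
The availability test is a pure capacity check; in isolation (the byte cap below is an assumed stand-in for SERVER_KNOBS->MAX_PHYSICAL_SHARD_BYTES):

    #include <cstdint>

    // Assumed stand-in for SERVER_KNOBS->MAX_PHYSICAL_SHARD_BYTES.
    constexpr int64_t kMaxPhysicalShardBytes = 500'000'000;

    // A shard can accept a move-in only if it stays under the byte cap afterwards.
    bool canAcceptMoveIn(int64_t currentBytes, int64_t moveInBytes) {
        return currentBytes + moveInBytes <= kMaxPhysicalShardBytes;
    }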

std::string PhysicalShardCollection::convertIDsToString(std::set<uint64_t> ids) {
std::string r = "";
for (auto id : ids) {
r = r + std::to_string(id) + " ";
}
return r;
}

void PhysicalShardCollection::updateTeamPhysicalShardIDsMap(uint64_t inputPhysicalShardID,
std::vector<ShardsAffectedByTeamFailure::Team> inputTeams,
uint64_t debugID) {
ASSERT(inputTeams.size() <= 2);
ASSERT(inputPhysicalShardID != anonymousShardId.first() && inputPhysicalShardID != UID().first());
for (auto inputTeam : inputTeams) {
if (teamPhysicalShardIDs.count(inputTeam) == 0) {
std::set<uint64_t> physicalShardIDSet;
physicalShardIDSet.insert(inputPhysicalShardID);
teamPhysicalShardIDs.insert(std::make_pair(inputTeam, physicalShardIDSet));
} else {
teamPhysicalShardIDs[inputTeam].insert(inputPhysicalShardID);
}
}
return;
}

void PhysicalShardCollection::insertPhysicalShardToCollection(uint64_t physicalShardID,
StorageMetrics const& metrics,
std::vector<ShardsAffectedByTeamFailure::Team> teams,
uint64_t debugID,
PhysicalShardCreationTime whenCreated) {
ASSERT(physicalShardID != anonymousShardId.first() && physicalShardID != UID().first());
ASSERT(physicalShardInstances.count(physicalShardID) == 0);
physicalShardInstances.insert(
std::make_pair(physicalShardID, PhysicalShard(physicalShardID, metrics, teams, whenCreated)));
return;
}

void PhysicalShardCollection::updatekeyRangePhysicalShardIDMap(KeyRange keyRange,
uint64_t physicalShardID,
uint64_t debugID) {
ASSERT(physicalShardID != UID().first());
keyRangePhysicalShardIDMap.insert(keyRange, physicalShardID);
return;
}

// At the beginning of the transition from the initial state (without the physical shard notion)
// to the physical-shard-aware state, the physicalShard set contains only one element, which is anonymousShardId[0]
// After a period in the transition, the physicalShard set of the team contains some meaningful physicalShardIDs
Optional<uint64_t> PhysicalShardCollection::trySelectAvailablePhysicalShardFor(ShardsAffectedByTeamFailure::Team team,
StorageMetrics const& moveInMetrics,
uint64_t debugID) {
ASSERT(team.servers.size() > 0);
// Case: The team is not tracked in the mapping (teamPhysicalShardIDs)
if (teamPhysicalShardIDs.count(team) == 0) {
return Optional<uint64_t>();
}
ASSERT(teamPhysicalShardIDs[team].size() >= 1);
// Case: The team is tracked in the mapping and the system already has physical shard notion
// and the number of physicalShard is large
std::vector<uint64_t> availablePhysicalShardIDs;
for (auto physicalShardID : teamPhysicalShardIDs[team]) {
if (physicalShardID == anonymousShardId.first() || physicalShardID == UID().first()) {
ASSERT(false);
}
ASSERT(physicalShardInstances.count(physicalShardID));
/*TraceEvent("TryGetPhysicalShardIDCandidates")
.detail("PhysicalShardID", physicalShardID)
.detail("Bytes", physicalShardInstances[physicalShardID].metrics.bytes)
.detail("BelongTeam", team.toString())
.detail("DebugID", debugID);*/
if (!checkPhysicalShardAvailable(physicalShardID, moveInMetrics)) {
continue;
}
availablePhysicalShardIDs.push_back(physicalShardID);
}
if (availablePhysicalShardIDs.size() == 0) {
/*TraceEvent("TryGetPhysicalShardIDResultFailed")
.detail("Reason", "no valid physicalShard")
.detail("MoveInBytes", moveInMetrics.bytes)
.detail("MaxPhysicalShardBytes", SERVER_KNOBS->MAX_PHYSICAL_SHARD_BYTES)
.detail("DebugID", debugID);*/
return Optional<uint64_t>();
}
return deterministicRandom()->randomChoice(availablePhysicalShardIDs);
}
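
Stripped of FDB types, the selection above is: look up the team's shard set, filter by the capacity check, then pick uniformly at random. A standalone sketch under those assumptions (Team is reduced to a string key):

    #include <cstdint>
    #include <map>
    #include <optional>
    #include <random>
    #include <set>
    #include <string>
    #include <vector>

    using Team = std::string; // stand-in for ShardsAffectedByTeamFailure::Team

    std::optional<uint64_t> trySelectShard(const std::map<Team, std::set<uint64_t>>& teamShards,
                                           const std::map<uint64_t, int64_t>& shardBytes,
                                           const Team& team,
                                           int64_t moveInBytes,
                                           int64_t maxBytes,
                                           std::mt19937_64& rng) {
        auto it = teamShards.find(team);
        if (it == teamShards.end())
            return std::nullopt; // team not tracked yet
        std::vector<uint64_t> candidates;
        for (uint64_t id : it->second)
            if (shardBytes.at(id) + moveInBytes <= maxBytes) // capacity filter
                candidates.push_back(id);
        if (candidates.empty())
            return std::nullopt; // caller falls back to generating a new shard id
        std::uniform_int_distribution<size_t> pick(0, candidates.size() - 1);
        return candidates[pick(rng)];
    }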

uint64_t PhysicalShardCollection::generateNewPhysicalShardID(uint64_t debugID) {
uint64_t physicalShardID = UID().first();
int stuckCount = 0;
while (physicalShardID == UID().first() || physicalShardID == anonymousShardId.first()) {
physicalShardID = deterministicRandom()->randomUInt64();
stuckCount = stuckCount + 1;
if (stuckCount > 50) {
ASSERT(false);
}
}
ASSERT(physicalShardID != UID().first() && physicalShardID != anonymousShardId.first());
//TraceEvent("GenerateNewPhysicalShardID").detail("PhysicalShardID", physicalShardID).detail("DebugID", debugID);
return physicalShardID;
}
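
The generator simply re-rolls a 64-bit random value until it misses the two reserved ids; standalone, with an assumed sentinel for the anonymous id:

    #include <cassert>
    #include <cstdint>
    #include <random>

    constexpr uint64_t kInvalidId = 0;   // UID().first() in the real code
    constexpr uint64_t kAnonymousId = 1; // assumed stand-in for anonymousShardId.first()

    uint64_t generateShardId(std::mt19937_64& rng) {
        uint64_t id = kInvalidId;
        int attempts = 0;
        while (id == kInvalidId || id == kAnonymousId) {
            id = rng();
            attempts++;
            // Two reserved values out of 2^64: repeated collisions are vanishingly rare.
            assert(attempts <= 50);
        }
        return id;
    }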

void PhysicalShardCollection::reduceMetricsForMoveOut(uint64_t physicalShardID, StorageMetrics const& moveOutMetrics) {
ASSERT(physicalShardInstances.count(physicalShardID) != 0);
ASSERT(physicalShardID != UID().first() && physicalShardID != anonymousShardId.first());
physicalShardInstances[physicalShardID].metrics = physicalShardInstances[physicalShardID].metrics - moveOutMetrics;
return;
}

void PhysicalShardCollection::increaseMetricsForMoveIn(uint64_t physicalShardID, StorageMetrics const& moveInMetrics) {
ASSERT(physicalShardInstances.count(physicalShardID) != 0);
ASSERT(physicalShardID != UID().first() && physicalShardID != anonymousShardId.first());
physicalShardInstances[physicalShardID].metrics = physicalShardInstances[physicalShardID].metrics + moveInMetrics;
return;
}

void PhysicalShardCollection::updatePhysicalShardMetricsByKeyRange(KeyRange keyRange,
StorageMetrics const& newMetrics,
StorageMetrics const& oldMetrics,
bool initWithNewMetrics) {
auto ranges = keyRangePhysicalShardIDMap.intersectingRanges(keyRange);
std::set<uint64_t> physicalShardIDSet;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
physicalShardIDSet.insert(it->value());
}
StorageMetrics delta;
if (initWithNewMetrics) {
delta = newMetrics;
} else {
delta = newMetrics - oldMetrics;
}
for (auto physicalShardID : physicalShardIDSet) {
ASSERT(physicalShardID != UID().first());
if (physicalShardID == anonymousShardId.first()) {
continue; // we ignore anonymousShard when updating physicalShard metrics
}
increaseMetricsForMoveIn(physicalShardID, (delta * (1.0 / physicalShardIDSet.size())));
}
return;
}
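
Because a key range can span several physical shards, the delta is split evenly among them (bytes only here; the anonymous shard is skipped, as in the code above). A standalone sketch:

    #include <cstdint>
    #include <map>
    #include <set>

    // Spread a byte delta evenly over the physical shards that intersect a key range.
    // shardBytes: per-shard byte totals; touched: ids of shards intersecting the range.
    void applyDeltaEvenly(std::map<uint64_t, int64_t>& shardBytes,
                          const std::set<uint64_t>& touched,
                          int64_t deltaBytes,
                          uint64_t anonymousId /* skipped, as in the real code */) {
        if (touched.empty())
            return;
        const int64_t share = deltaBytes / static_cast<int64_t>(touched.size()); // imprecise, like the original
        for (uint64_t id : touched) {
            if (id == anonymousId)
                continue;
            shardBytes[id] += share;
        }
    }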

InAnonymousPhysicalShard PhysicalShardCollection::isInAnonymousPhysicalShard(KeyRange keyRange) {
InAnonymousPhysicalShard res = InAnonymousPhysicalShard::True;
auto ranges = keyRangePhysicalShardIDMap.intersectingRanges(keyRange);
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
uint64_t physicalShardID = it->value();
if (physicalShardID != anonymousShardId.first()) {
// res = false if any part of keyRange belongs to a non-anonymous physicalShard;
// there is a case where some keyRange of the anonymousShard has been chosen to move
// to a non-anonymous physicalShard but the move has not completed
res = InAnonymousPhysicalShard::False;
}
}
return res;
}

// TODO: requires optimization
// It is slow to go through keyRangePhysicalShardIDRanges every time
// Do we need a data structure to store the keyRange for each physicalShard?
PhysicalShardHasMoreThanKeyRange PhysicalShardCollection::whetherPhysicalShardHasMoreThanKeyRange(
uint64_t physicalShardID,
KeyRange keyRange) {
KeyRangeMap<uint64_t>::Ranges keyRangePhysicalShardIDRanges = keyRangePhysicalShardIDMap.ranges();
KeyRangeMap<uint64_t>::iterator it = keyRangePhysicalShardIDRanges.begin();
for (; it != keyRangePhysicalShardIDRanges.end(); ++it) {
if (it->value() != physicalShardID) {
continue;
}
auto keyRangePiece = KeyRangeRef(it->range().begin, it->range().end);
if (!keyRange.intersects(keyRangePiece)) {
return PhysicalShardHasMoreThanKeyRange::True;
}
// if keyRange and keyRangePiece have an intersection
if (!keyRange.contains(keyRangePiece)) {
return PhysicalShardHasMoreThanKeyRange::True;
}
}
return PhysicalShardHasMoreThanKeyRange::False;
}

InOverSizePhysicalShard PhysicalShardCollection::isInOverSizePhysicalShard(KeyRange keyRange) {
auto ranges = keyRangePhysicalShardIDMap.intersectingRanges(keyRange);
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
uint64_t physicalShardID = it->value();
ASSERT(physicalShardID != UID().first());
if (physicalShardID == anonymousShardId.first()) {
continue;
}
if (checkPhysicalShardAvailable(physicalShardID, StorageMetrics())) {
continue;
}
if (!whetherPhysicalShardHasMoreThanKeyRange(physicalShardID, keyRange)) {
continue;
}
return InOverSizePhysicalShard::True;
}
return InOverSizePhysicalShard::False;
}

uint64_t PhysicalShardCollection::determinePhysicalShardIDGivenPrimaryTeam(
ShardsAffectedByTeamFailure::Team primaryTeam,
StorageMetrics const& metrics,
bool forceToUseNewPhysicalShard,
uint64_t debugID) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
ASSERT(primaryTeam.primary == true);
if (forceToUseNewPhysicalShard) {
return generateNewPhysicalShardID(debugID);
}
Optional<uint64_t> physicalShardIDFetch = trySelectAvailablePhysicalShardFor(primaryTeam, metrics, debugID);
if (!physicalShardIDFetch.present()) {
return generateNewPhysicalShardID(debugID);
}
return physicalShardIDFetch.get();
}

// May return a problematic remote team
Optional<ShardsAffectedByTeamFailure::Team> PhysicalShardCollection::tryGetAvailableRemoteTeamWith(
uint64_t inputPhysicalShardID,
StorageMetrics const& moveInMetrics,
uint64_t debugID) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
ASSERT(inputPhysicalShardID != anonymousShardId.first() && inputPhysicalShardID != UID().first());
if (physicalShardInstances.count(inputPhysicalShardID) == 0) {
return Optional<ShardsAffectedByTeamFailure::Team>();
}
if (!checkPhysicalShardAvailable(inputPhysicalShardID, moveInMetrics)) {
return Optional<ShardsAffectedByTeamFailure::Team>();
}
for (auto team : physicalShardInstances[inputPhysicalShardID].teams) {
if (team.primary == false) {
/*TraceEvent("TryGetRemoteTeamWith")
.detail("PhysicalShardID", inputPhysicalShardID)
.detail("Team", team.toString())
.detail("TeamSize", team.servers.size())
.detail("PhysicalShardsOfTeam", convertIDsToString(teamPhysicalShardIDs[team]))
.detail("DebugID", debugID);*/
return team;
}
}
UNREACHABLE();
}

// The update of PhysicalShardToTeams, Collection, keyRangePhysicalShardIDMap should be atomic
void PhysicalShardCollection::initPhysicalShardCollection(KeyRange keys,
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams,
uint64_t physicalShardID,
uint64_t debugID) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
ASSERT(physicalShardID != UID().first());
if (physicalShardID != anonymousShardId.first()) {
updateTeamPhysicalShardIDsMap(physicalShardID, selectedTeams, debugID);
if (physicalShardInstances.count(physicalShardID) == 0) {
insertPhysicalShardToCollection(
physicalShardID, StorageMetrics(), selectedTeams, debugID, PhysicalShardCreationTime::DDInit);
} else {
// This assertion will be broken if we enable the optimization of data move traffic between DCs
ASSERT(physicalShardInstances[physicalShardID].teams == selectedTeams);
}
} else {
// If any physicalShard restored at DD init is the anonymousShard,
// then DD enters a transition state where DD gradually moves shards (or key ranges)
// out of the anonymousShard
setTransitionCheck();
}
updatekeyRangePhysicalShardIDMap(keys, physicalShardID, debugID);
return;
}

// The update of PhysicalShardToTeams, Collection, keyRangePhysicalShardIDMap should be atomic
void PhysicalShardCollection::updatePhysicalShardCollection(
KeyRange keys,
bool isRestore,
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams,
uint64_t physicalShardID,
const StorageMetrics& metrics,
uint64_t debugID) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
ASSERT(physicalShardID != UID().first());
/*TraceEvent e("UpdatePhysicalShard");
e.detail("DebugID", debugID);
e.detail("KeyRange", keys);
e.detail("IsRestore", isRestore);*/
// When updating metrics in the physicalShard collection, we assume:
// it is impossible to move a keyRange from the anonymousShard to a valid physicalShard
// Thus, we ignore the anonymousShard when updating metrics
if (physicalShardID != anonymousShardId.first()) {
updateTeamPhysicalShardIDsMap(physicalShardID, selectedTeams, debugID);
// Update physicalShardInstances
// Add the metrics to in-physicalShard
// e.detail("PhysicalShardIDIn", physicalShardID);
if (physicalShardInstances.count(physicalShardID) == 0) {
// e.detail("Op", "Insert");
insertPhysicalShardToCollection(
physicalShardID, metrics, selectedTeams, debugID, PhysicalShardCreationTime::DDRelocator);
} else {
// e.detail("Op", "Update");
// This assertion is true since we disable the optimization of data move traffic between DCs
ASSERT(physicalShardInstances[physicalShardID].teams == selectedTeams);
increaseMetricsForMoveIn(physicalShardID, metrics);
}
}
// Minus the metrics from the existing (multiple) out-physicalShard(s)
auto ranges = keyRangePhysicalShardIDMap.intersectingRanges(keys);
std::set<uint64_t> physicalShardIDSet;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
physicalShardIDSet.insert(it->value());
}
/*std::string physicalShardIDOut = "";
for (auto id : physicalShardIDSet) {
physicalShardIDOut = physicalShardIDOut + std::to_string(id) + " ";
}*/
// e.detail("PhysicalShardIDOut", physicalShardIDOut);
for (auto physicalShardID : physicalShardIDSet) { // imprecise: evenly move out bytes
if (physicalShardID == anonymousShardId.first()) {
continue; // we ignore anonymousShard when updating physicalShard metrics
}
StorageMetrics toReduceMetrics = metrics * (1.0 / physicalShardIDSet.size());
reduceMetricsForMoveOut(physicalShardID, toReduceMetrics);
}
// keyRangePhysicalShardIDMap must be updated after updating the metrics of physicalShardInstances
updatekeyRangePhysicalShardIDMap(keys, physicalShardID, debugID);
return;
}

// return false if no need to move keyRange out of current physical shard
MoveKeyRangeOutPhysicalShard PhysicalShardCollection::trackPhysicalShard(KeyRange keyRange,
StorageMetrics const& newMetrics,
StorageMetrics const& oldMetrics,
bool initWithNewMetrics) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
updatePhysicalShardMetricsByKeyRange(keyRange, newMetrics, oldMetrics, initWithNewMetrics);
if (requireTransitionCheck() &&
now() - lastTransitionStartTime > SERVER_KNOBS->ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME) {
if (isInAnonymousPhysicalShard(keyRange)) {
// Currently, whenever a shard updates metrics, it checks whether it is in the anonymous physical shard.
// If so, and the shard has existed for a long time, a data move is triggered on the shard.
resetLastTransitionStartTime();
TraceEvent("PhysicalShardTiggerTransitionMove")
.detail("KeyRange", keyRange)
.detail("TransitionCoolDownTime", SERVER_KNOBS->ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME);
return MoveKeyRangeOutPhysicalShard::True;
}
}
if (isInOverSizePhysicalShard(keyRange)) {
return MoveKeyRangeOutPhysicalShard::True;
}
return MoveKeyRangeOutPhysicalShard::False;
}
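
The move-out decision reduces to two independent triggers; a standalone sketch of just the decision, with an assumed transition-time knob:

    // Assumed stand-in for SERVER_KNOBS->ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME (seconds).
    constexpr double kTransitionTime = 600.0;

    // Returns true if the range should be relocated out of its current physical shard.
    bool shouldMoveOut(bool transitionCheckPending,
                       double secondsSinceLastTransition,
                       bool rangeIsInAnonymousShard,
                       bool rangeIsInOverSizeShard) {
        if (transitionCheckPending && secondsSinceLastTransition > kTransitionTime && rangeIsInAnonymousShard)
            return true; // drain the anonymous shard, rate-limited by the transition timer
        return rangeIsInOverSizeShard; // split out of shards that exceeded the byte cap
    }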

// The update of PhysicalShardToTeams, PhysicalShardInstances, KeyRangePhysicalShardIDMap should be atomic
void PhysicalShardCollection::cleanUpPhysicalShardCollection() {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
std::set<uint64_t> physicalShardsInUse;
std::map<uint64_t, StorageMetrics> metricsReplies;
KeyRangeMap<uint64_t>::Ranges keyRangePhysicalShardIDRanges = keyRangePhysicalShardIDMap.ranges();
KeyRangeMap<uint64_t>::iterator it = keyRangePhysicalShardIDRanges.begin();
// Assume that once a physical shard disappears from keyRangePhysicalShardIDMap,
// the physical shard (with the deleted id) should be deprecated.
// This function cleans up those deprecated physical shards in PhysicalShardCollection.
// It collects physicalShard usage info from keyRangePhysicalShardIDMap,
// then uses that info to update PhysicalShardToTeams and PhysicalShardInstances

// keyRangePhysicalShardIDMap indicates which physicalShard actually has data
// Step 1: Clear unused physicalShard in physicalShardInstances based on keyRangePhysicalShardIDMap
for (; it != keyRangePhysicalShardIDRanges.end(); ++it) {
uint64_t physicalShardID = it->value();
if (physicalShardID == anonymousShardId.first()) {
continue;
}
physicalShardsInUse.insert(physicalShardID);
}
for (auto it = physicalShardInstances.begin(); it != physicalShardInstances.end();) {
uint64_t physicalShardID = it->first;
ASSERT(physicalShardInstances.count(physicalShardID) > 0);
if (physicalShardsInUse.count(physicalShardID) == 0) {
/*TraceEvent("PhysicalShardisEmpty")
.detail("PhysicalShard", physicalShardID)
.detail("RemainBytes", physicalShardInstances[physicalShardID].metrics.bytes);*/
// "RemainBytes" indicates the deviation of current physical shard metric update
it = physicalShardInstances.erase(it);
} else {
it++;
}
}
// Step 2: Clean up teamPhysicalShardIDs
std::set<ShardsAffectedByTeamFailure::Team> toRemoveTeams;
for (auto [team, _] : teamPhysicalShardIDs) {
for (auto it = teamPhysicalShardIDs[team].begin(); it != teamPhysicalShardIDs[team].end();) {
uint64_t physicalShardID = *it;
if (physicalShardInstances.count(physicalShardID) == 0) {
// physicalShardID has been removed from physicalShardInstances (see step 1)
// So, remove the physicalShard from teamPhysicalShardID[team]
it = teamPhysicalShardIDs[team].erase(it);
} else {
it++;
}
}
if (teamPhysicalShardIDs[team].size() == 0) {
// If a team has no physicalShard, remove the team from teamPhysicalShardID
toRemoveTeams.insert(team);
}
}
for (auto team : toRemoveTeams) {
teamPhysicalShardIDs.erase(team);
}
}
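
Both steps use the erase-while-iterating idiom (erase returns the next valid iterator). A standalone sketch of the same two-step sweep over simplified maps:

    #include <cstdint>
    #include <iterator>
    #include <map>
    #include <set>
    #include <string>

    using Team = std::string; // stand-in for ShardsAffectedByTeamFailure::Team

    // Remove shard instances no longer referenced by any key range, then drop
    // the dangling ids (and newly empty teams) from the team index.
    void cleanUp(std::map<uint64_t, int64_t>& shardBytes,
                 std::map<Team, std::set<uint64_t>>& teamShards,
                 const std::set<uint64_t>& shardsInUse) {
        for (auto it = shardBytes.begin(); it != shardBytes.end();) {
            if (shardsInUse.count(it->first) == 0)
                it = shardBytes.erase(it); // erase returns the next valid iterator
            else
                ++it;
        }
        for (auto teamIt = teamShards.begin(); teamIt != teamShards.end();) {
            auto& ids = teamIt->second;
            for (auto idIt = ids.begin(); idIt != ids.end();) {
                if (shardBytes.count(*idIt) == 0)
                    idIt = ids.erase(idIt);
                else
                    ++idIt;
            }
            teamIt = ids.empty() ? teamShards.erase(teamIt) : std::next(teamIt);
        }
    }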

void PhysicalShardCollection::logPhysicalShardCollection() {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
// Step 1: Logging non-empty physicalShard
for (auto [physicalShardID, physicalShard] : physicalShardInstances) {
ASSERT(physicalShardID == physicalShard.id);
TraceEvent e("PhysicalShardStatus");
e.detail("PhysicalShardID", physicalShardID);
e.detail("TotalBytes", physicalShard.metrics.bytes);
}
// Step 2: Logging TeamPhysicalShardStatus
for (auto [team, physicalShardIDs] : teamPhysicalShardIDs) {
TraceEvent e("TeamPhysicalShardStatus");
e.detail("Team", team.toString());
// std::string metricsStr = "";
int64_t counter = 0;
int64_t totalBytes = 0;
int64_t maxPhysicalShardBytes = -1;
int64_t minPhysicalShardBytes = StorageMetrics::infinity;
uint64_t maxPhysicalShardID = 0;
uint64_t minPhysicalShardID = 0;
for (auto physicalShardID : physicalShardIDs) {
ASSERT(physicalShardInstances.count(physicalShardID) > 0);
uint64_t id = physicalShardInstances[physicalShardID].id;
int64_t bytes = physicalShardInstances[physicalShardID].metrics.bytes;
if (bytes > maxPhysicalShardBytes) {
maxPhysicalShardBytes = bytes;
maxPhysicalShardID = id;
}
if (bytes < minPhysicalShardBytes) {
minPhysicalShardBytes = bytes;
minPhysicalShardID = id;
}
totalBytes = totalBytes + bytes;
/* metricsStr = metricsStr + std::to_string(id) + ":" + std::to_string(bytes);
if (counter < physicalShardIDs.size() - 1) {
metricsStr = metricsStr + ",";
} */
counter = counter + 1;
}
// e.detail("Metrics", metricsStr);
e.detail("TotalBytes", totalBytes);
e.detail("NumPhysicalShards", counter);
e.detail("MaxPhysicalShard", std::to_string(maxPhysicalShardID) + ":" + std::to_string(maxPhysicalShardBytes));
e.detail("MinPhysicalShard", std::to_string(minPhysicalShardID) + ":" + std::to_string(minPhysicalShardBytes));
}
// Step 3: Logging StorageServerPhysicalShardStatus
std::map<UID, std::map<uint64_t, int64_t>> storageServerPhysicalShardStatus;
for (auto [team, _] : teamPhysicalShardIDs) {
for (auto ssid : team.servers) {
for (auto it = teamPhysicalShardIDs[team].begin(); it != teamPhysicalShardIDs[team].end();) {
uint64_t physicalShardID = *it;
if (storageServerPhysicalShardStatus.count(ssid) != 0) {
if (storageServerPhysicalShardStatus[ssid].count(physicalShardID) == 0) {
ASSERT(physicalShardInstances.count(physicalShardID) > 0);
storageServerPhysicalShardStatus[ssid].insert(
std::make_pair(physicalShardID, physicalShardInstances[physicalShardID].metrics.bytes));
}
} else {
ASSERT(physicalShardInstances.count(physicalShardID) > 0);
std::map<uint64_t, int64_t> tmp;
tmp.insert(std::make_pair(physicalShardID, physicalShardInstances[physicalShardID].metrics.bytes));
storageServerPhysicalShardStatus.insert(std::make_pair(ssid, tmp));
}
it++;
}
}
}
for (auto [serverID, physicalShardMetrics] : storageServerPhysicalShardStatus) {
TraceEvent e("ServerPhysicalShardStatus");
e.detail("Server", serverID);
e.detail("NumPhysicalShards", physicalShardMetrics.size());
int64_t totalBytes = 0;
int64_t maxPhysicalShardBytes = -1;
int64_t minPhysicalShardBytes = StorageMetrics::infinity;
uint64_t maxPhysicalShardID = 0;
uint64_t minPhysicalShardID = 0;
// std::string metricsStr = "";
// int64_t counter = 0;
for (auto [physicalShardID, bytes] : physicalShardMetrics) {
totalBytes = totalBytes + bytes;
if (bytes > maxPhysicalShardBytes) {
maxPhysicalShardBytes = bytes;
maxPhysicalShardID = physicalShardID;
}
if (bytes < minPhysicalShardBytes) {
minPhysicalShardBytes = bytes;
minPhysicalShardID = physicalShardID;
}
/* metricsStr = metricsStr + std::to_string(physicalShardID) + ":" + std::to_string(bytes);
if (counter < physicalShardMetrics.size() - 1) {
metricsStr = metricsStr + ",";
}
counter = counter + 1; */
}
e.detail("TotalBytes", totalBytes);
e.detail("MaxPhysicalShard", std::to_string(maxPhysicalShardID) + ":" + std::to_string(maxPhysicalShardBytes));
e.detail("MinPhysicalShard", std::to_string(minPhysicalShardID) + ":" + std::to_string(minPhysicalShardBytes));
}
}
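
Each logging step computes the same total/min/max summary over (id, bytes) pairs; factored out as a standalone sketch:

    #include <cstdint>
    #include <map>

    struct ShardSummary {
        int64_t totalBytes = 0;
        uint64_t maxId = 0, minId = 0;
        int64_t maxBytes = -1;
        int64_t minBytes = INT64_MAX;
    };

    ShardSummary summarize(const std::map<uint64_t, int64_t>& shardBytes) {
        ShardSummary s;
        for (auto [id, bytes] : shardBytes) {
            s.totalBytes += bytes;
            if (bytes > s.maxBytes) { s.maxBytes = bytes; s.maxId = id; }
            if (bytes < s.minBytes) { s.minBytes = bytes; s.minId = id; }
        }
        return s;
    }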

@ -395,6 +395,61 @@ class DDTxnProcessorImpl {
}
}
}

ACTOR static Future<bool> isDataDistributionEnabled(Database cx, const DDEnabledState* ddEnabledState) {
state Transaction tr(cx);
loop {
try {
Optional<Value> mode = wait(tr.get(dataDistributionModeKey));
if (!mode.present() && ddEnabledState->isDDEnabled())
return true;
if (mode.present()) {
BinaryReader rd(mode.get(), Unversioned());
int m;
rd >> m;
if (m && ddEnabledState->isDDEnabled()) {
TraceEvent(SevDebug, "IsDDEnabledSucceeded")
.detail("Mode", m)
.detail("IsDDEnabled", ddEnabledState->isDDEnabled());
return true;
}
}
// SOMEDAY: Write a wrapper in MoveKeys.actor.h
Optional<Value> readVal = wait(tr.get(moveKeysLockOwnerKey));
UID currentOwner =
readVal.present() ? BinaryReader::fromStringRef<UID>(readVal.get(), Unversioned()) : UID();
if (ddEnabledState->isDDEnabled() && (currentOwner != dataDistributionModeLock)) {
TraceEvent(SevDebug, "IsDDEnabledSucceeded")
.detail("CurrentOwner", currentOwner)
.detail("DDModeLock", dataDistributionModeLock)
.detail("IsDDEnabled", ddEnabledState->isDDEnabled());
return true;
}
TraceEvent(SevDebug, "IsDDEnabledFailed")
.detail("CurrentOwner", currentOwner)
.detail("DDModeLock", dataDistributionModeLock)
.detail("IsDDEnabled", ddEnabledState->isDDEnabled());
return false;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
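
The enablement check reduces to a small decision over three inputs; a standalone sketch of the pure logic (names are illustrative, not FDB API):

    #include <optional>

    // Pure form of the checks above: ddEnabledFlag is the in-memory DDEnabledState,
    // mode is the decoded dataDistributionModeKey (absent if never set), and
    // ownerIsModeLock says whether moveKeysLockOwnerKey equals dataDistributionModeLock.
    bool isDDEnabledDecision(bool ddEnabledFlag, std::optional<int> mode, bool ownerIsModeLock) {
        if (!ddEnabledFlag)
            return false;
        if (!mode.has_value())
            return true;         // mode never written: DD is on by default
        if (mode.value() != 0)
            return true;         // mode explicitly enabled
        return !ownerIsModeLock; // mode == 0: enabled only if the lock owner is not the mode lock
    }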

ACTOR static Future<Void> pollMoveKeysLock(Database cx, MoveKeysLock lock, const DDEnabledState* ddEnabledState) {
loop {
wait(delay(SERVER_KNOBS->MOVEKEYS_LOCK_POLLING_DELAY));
state Transaction tr(cx);
loop {
try {
wait(checkMoveKeysLockReadOnly(&tr, lock, ddEnabledState));
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
}
};

Future<IDDTxnProcessor::SourceServers> DDTxnProcessor::getSourceServersForRange(const KeyRangeRef range) {

@ -406,7 +461,7 @@ Future<std::vector<std::pair<StorageServerInterface, ProcessClass>>> DDTxnProces
return NativeAPI::getServerListAndProcessClasses(&tr);
}

Future<MoveKeysLock> DDTxnProcessor::takeMoveKeysLock(UID ddId) const {
Future<MoveKeysLock> DDTxnProcessor::takeMoveKeysLock(const UID& ddId) const {
return ::takeMoveKeysLock(cx, ddId);
}

@ -431,3 +486,11 @@ Future<Reference<InitialDataDistribution>> DDTxnProcessor::getInitialDataDistrib
Future<Void> DDTxnProcessor::waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const {
return DDTxnProcessorImpl::waitForDataDistributionEnabled(cx, ddEnabledState);
}

Future<bool> DDTxnProcessor::isDataDistributionEnabled(const DDEnabledState* ddEnabledState) const {
return DDTxnProcessorImpl::isDataDistributionEnabled(cx, ddEnabledState);
}

Future<Void> DDTxnProcessor::pollMoveKeysLock(const MoveKeysLock& lock, const DDEnabledState* ddEnabledState) const {
return DDTxnProcessorImpl::pollMoveKeysLock(cx, lock, ddEnabledState);
}

@ -49,7 +49,7 @@
#include "flow/serialize.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"

#include "fdbserver/DDSharedContext.h"
#include "flow/actorcompiler.h" // This must be the last #include.

void DataMove::validateShard(const DDShardInfo& shard, KeyRangeRef range, int priority) {

@ -202,46 +202,6 @@ ACTOR Future<Void> remoteRecovered(Reference<AsyncVar<ServerDBInfo> const> db) {
return Void();
}

ACTOR Future<bool> isDataDistributionEnabled(Database cx, const DDEnabledState* ddEnabledState) {
state Transaction tr(cx);
loop {
try {
Optional<Value> mode = wait(tr.get(dataDistributionModeKey));
if (!mode.present() && ddEnabledState->isDDEnabled())
return true;
if (mode.present()) {
BinaryReader rd(mode.get(), Unversioned());
int m;
rd >> m;
if (m && ddEnabledState->isDDEnabled()) {
TraceEvent(SevDebug, "IsDDEnabledSucceeded")
.detail("Mode", m)
.detail("IsDDEnabled", ddEnabledState->isDDEnabled());
return true;
}
}
// SOMEDAY: Write a wrapper in MoveKeys.actor.h
Optional<Value> readVal = wait(tr.get(moveKeysLockOwnerKey));
UID currentOwner =
readVal.present() ? BinaryReader::fromStringRef<UID>(readVal.get(), Unversioned()) : UID();
if (ddEnabledState->isDDEnabled() && (currentOwner != dataDistributionModeLock)) {
TraceEvent(SevDebug, "IsDDEnabledSucceeded")
.detail("CurrentOwner", currentOwner)
.detail("DDModeLock", dataDistributionModeLock)
.detail("IsDDEnabled", ddEnabledState->isDDEnabled());
return true;
}
TraceEvent(SevDebug, "IsDDEnabledFailed")
.detail("CurrentOwner", currentOwner)
.detail("DDModeLock", dataDistributionModeLock)
.detail("IsDDEnabled", ddEnabledState->isDDEnabled());
return false;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}

// Ensures that the serverKeys key space is properly coalesced
// This method is only used for testing and is not implemented in a manner that is safe for large databases
ACTOR Future<Void> debugCheckCoalescing(Database cx) {

@ -284,24 +244,10 @@ static std::set<int> const& normalDDQueueErrors() {
return s;
}

ACTOR Future<Void> pollMoveKeysLock(Database cx, MoveKeysLock lock, const DDEnabledState* ddEnabledState) {
loop {
wait(delay(SERVER_KNOBS->MOVEKEYS_LOCK_POLLING_DELAY));
state Transaction tr(cx);
loop {
try {
wait(checkMoveKeysLockReadOnly(&tr, lock, ddEnabledState));
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
}

struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
public:
Reference<AsyncVar<ServerDBInfo> const> dbInfo;
Reference<DDSharedContext> context;
UID ddId;
PromiseStream<Future<Void>> addActor;

@ -325,9 +271,13 @@ public:
// consumer is a yield stream from producer. The RelocateShard is pushed into relocationProducer and popped from
// relocationConsumer (by DDQueue)
PromiseStream<RelocateShard> relocationProducer, relocationConsumer;
Reference<PhysicalShardCollection> physicalShardCollection;

DataDistributor(Reference<AsyncVar<ServerDBInfo> const> const& db, UID id)
: dbInfo(db), ddId(id), txnProcessor(nullptr), initialDDEventHolder(makeReference<EventCacheHolder>("InitialDD")),
StorageQuotaInfo storageQuotaInfo;

DataDistributor(Reference<AsyncVar<ServerDBInfo> const> const& db, UID id, Reference<DDSharedContext> context)
: dbInfo(db), context(context), ddId(id), txnProcessor(nullptr),
initialDDEventHolder(makeReference<EventCacheHolder>("InitialDD")),
movingDataEventHolder(makeReference<EventCacheHolder>("MovingData")),
totalDataInFlightEventHolder(makeReference<EventCacheHolder>("TotalDataInFlight")),
totalDataInFlightRemoteEventHolder(makeReference<EventCacheHolder>("TotalDataInFlightRemote")),

@ -343,13 +293,13 @@ public:
return txnProcessor->updateReplicaKeys(primaryDcId, remoteDcIds, configuration);
}

Future<Void> loadInitialDataDistribution(const DDEnabledState* ddEnabledState) {
Future<Void> loadInitialDataDistribution() {
return store(initData,
txnProcessor->getInitialDataDistribution(
ddId,
lock,
configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(),
ddEnabledState));
context->ddEnabledState.get()));
}

void initDcInfo() {

@ -364,14 +314,14 @@ public:
}
}

Future<Void> waitDataDistributorEnabled(const DDEnabledState* ddEnabledState) const {
return txnProcessor->waitForDataDistributionEnabled(ddEnabledState);
Future<Void> waitDataDistributorEnabled() const {
return txnProcessor->waitForDataDistributionEnabled(context->ddEnabledState.get());
}

// Initialize the required internal states of DataDistributor. It's necessary before DataDistributor start working.
|
||||
// Doesn't include initialization of optional components, like TenantCache, DDQueue, Tracker, TeamCollection. The
|
||||
// components should call its own ::init methods.
|
||||
ACTOR static Future<Void> init(Reference<DataDistributor> self, const DDEnabledState* ddEnabledState) {
|
||||
ACTOR static Future<Void> init(Reference<DataDistributor> self) {
|
||||
loop {
|
||||
TraceEvent("DDInitTakingMoveKeysLock", self->ddId).log();
|
||||
wait(self->takeMoveKeysLock());
|
||||
|
@ -384,7 +334,7 @@ public:
|
|||
wait(self->updateReplicaKeys());
|
||||
TraceEvent("DDInitUpdatedReplicaKeys", self->ddId).log();
|
||||
|
||||
wait(self->loadInitialDataDistribution(ddEnabledState));
|
||||
wait(self->loadInitialDataDistribution());
|
||||
|
||||
if (self->initData->shards.size() > 1) {
|
||||
TraceEvent("DDInitGotInitialDD", self->ddId)
|
||||
|
@ -402,7 +352,7 @@ public:
|
|||
.trackLatest(self->initialDDEventHolder->trackingKey);
|
||||
}
|
||||
|
||||
if (self->initData->mode && ddEnabledState->isDDEnabled()) {
|
||||
if (self->initData->mode && self->context->isDDEnabled()) {
|
||||
// mode may be set true by system operator using fdbcli and isDDEnabled() set to true
|
||||
break;
|
||||
}
|
||||
|
@ -443,13 +393,27 @@ public:
|
|||
.detail("HighestPriority", self->configuration.usableRegions > 1 ? 0 : -1)
|
||||
.trackLatest(self->totalDataInFlightRemoteEventHolder->trackingKey);
|
||||
|
||||
wait(self->waitDataDistributorEnabled(ddEnabledState));
|
||||
wait(self->waitDataDistributorEnabled());
|
||||
TraceEvent("DataDistributionEnabled").log();
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR static Future<Void> resumeFromShards(Reference<DataDistributor> self, bool traceShard) {
|
||||
// All physicalShard init must be completed before issuing data move
|
||||
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
|
||||
for (int i = 0; i < self->initData->shards.size() - 1; i++) {
|
||||
const DDShardInfo& iShard = self->initData->shards[i];
|
||||
KeyRangeRef keys = KeyRangeRef(iShard.key, self->initData->shards[i + 1].key);
|
||||
std::vector<ShardsAffectedByTeamFailure::Team> teams;
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.primarySrc, true));
|
||||
if (self->configuration.usableRegions > 1) {
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.remoteSrc, false));
|
||||
}
|
||||
self->physicalShardCollection->initPhysicalShardCollection(keys, teams, iShard.srcId.first(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
state int shard = 0;
|
||||
for (; shard < self->initData->shards.size() - 1; shard++) {
|
||||
const DDShardInfo& iShard = self->initData->shards[shard];
|
||||
|
@ -543,12 +507,58 @@ public:
|
|||
Future<Void> shardsReady = resumeFromShards(Reference<DataDistributor>::addRef(this), g_network->isSimulated());
|
||||
return resumeFromDataMoves(Reference<DataDistributor>::addRef(this), shardsReady);
|
||||
}
|
||||
|
||||
Future<Void> pollMoveKeysLock() { return txnProcessor->pollMoveKeysLock(lock, context->ddEnabledState.get()); }
|
||||
|
||||
Future<bool> isDataDistributionEnabled() const {
|
||||
return txnProcessor->isDataDistributionEnabled(context->ddEnabledState.get());
|
||||
}
|
||||
|
||||
Future<Void> removeKeysFromFailedServer(const UID& serverID, const std::vector<UID>& teamForDroppedRange) const {
|
||||
return txnProcessor->removeKeysFromFailedServer(
|
||||
serverID, teamForDroppedRange, lock, context->ddEnabledState.get());
|
||||
}
|
||||
|
||||
Future<Void> removeStorageServer(const UID& serverID, const Optional<UID>& tssPairID = Optional<UID>()) const {
|
||||
return txnProcessor->removeStorageServer(serverID, tssPairID, lock, context->ddEnabledState.get());
|
||||
}
|
||||
};
|
||||
|
||||
ACTOR Future<Void> storageQuotaTracker(Database cx, StorageQuotaInfo* storageQuotaInfo) {
|
||||
loop {
|
||||
state Transaction tr(cx);
|
||||
loop {
|
||||
try {
|
||||
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
|
||||
TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size());
|
||||
for (auto const kv : currentQuotas) {
|
||||
Key const key = kv.key.removePrefix(storageQuotaPrefix);
|
||||
uint64_t const quota = BinaryReader::fromStringRef<uint64_t>(kv.value, Unversioned());
|
||||
storageQuotaInfo->quotaMap[key] = quota;
|
||||
}
|
||||
wait(delay(5.0));
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Periodically check and log the physicalShard status; clean up empty physicalShard;
|
||||
ACTOR Future<Void> monitorPhysicalShardStatus(Reference<PhysicalShardCollection> self) {
|
||||
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
|
||||
ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD);
|
||||
loop {
|
||||
self->cleanUpPhysicalShardCollection();
|
||||
self->logPhysicalShardCollection();
|
||||
wait(delay(SERVER_KNOBS->PHYSICAL_SHARD_METRICS_DELAY));
|
||||
}
|
||||
}
|
||||
|
||||
// Runs the data distribution algorithm for FDB, including the DD Queue, DD tracker, and DD team collection
|
||||
ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
||||
PromiseStream<GetMetricsListRequest> getShardMetricsList,
|
||||
const DDEnabledState* ddEnabledState) {
|
||||
PromiseStream<GetMetricsListRequest> getShardMetricsList) {
|
||||
state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DataDistributionLaunch, LockAware::True);
|
||||
cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE;
|
||||
self->txnProcessor = std::shared_ptr<IDDTxnProcessor>(new DDTxnProcessor(cx));
|
||||
|
@ -572,7 +582,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
state KeyRangeMap<ShardTrackedData> shards;
|
||||
state Promise<UID> removeFailedServer;
|
||||
try {
|
||||
wait(DataDistributor::init(self, ddEnabledState));
|
||||
wait(DataDistributor::init(self));
|
||||
|
||||
state Reference<TenantCache> ddTenantCache;
|
||||
if (ddIsTenantAware) {
|
||||
|
@ -592,6 +602,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
state Promise<Void> readyToStart;
|
||||
|
||||
self->shardsAffectedByTeamFailure = makeReference<ShardsAffectedByTeamFailure>();
|
||||
self->physicalShardCollection = makeReference<PhysicalShardCollection>();
|
||||
wait(self->resumeRelocations());
|
||||
|
||||
std::vector<TeamCollectionInterface> tcis; // primary and remote region interface
|
||||
|
@ -618,11 +629,12 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
ddTenantCache->monitorTenantMap(), "DDTenantCacheMonitor", self->ddId, &normalDDQueueErrors()));
|
||||
}
|
||||
|
||||
actors.push_back(pollMoveKeysLock(cx, self->lock, ddEnabledState));
|
||||
actors.push_back(self->pollMoveKeysLock());
|
||||
actors.push_back(reportErrorsExcept(dataDistributionTracker(self->initData,
|
||||
cx,
|
||||
self->relocationProducer,
|
||||
self->shardsAffectedByTeamFailure,
|
||||
self->physicalShardCollection,
|
||||
getShardMetrics,
|
||||
getTopKShardMetrics.getFuture(),
|
||||
getShardMetricsList,
|
||||
|
@ -644,17 +656,23 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
processingWiggle,
|
||||
tcis,
|
||||
self->shardsAffectedByTeamFailure,
|
||||
self->physicalShardCollection,
|
||||
self->lock,
|
||||
getAverageShardBytes,
|
||||
getUnhealthyRelocationCount.getFuture(),
|
||||
self->ddId,
|
||||
storageTeamSize,
|
||||
self->configuration.storageTeamSize,
|
||||
ddEnabledState),
|
||||
self->context->ddEnabledState.get()),
|
||||
"DDQueue",
|
||||
self->ddId,
|
||||
&normalDDQueueErrors()));
|
||||
|
||||
actors.push_back(reportErrorsExcept(storageQuotaTracker(cx, &self->storageQuotaInfo),
|
||||
"StorageQuotaTracker",
|
||||
self->ddId,
|
||||
&normalDDQueueErrors()));
|
||||
|
||||
std::vector<DDTeamCollection*> teamCollectionsPtrs;
|
||||
primaryTeamCollection = makeReference<DDTeamCollection>(
|
||||
cx,
|
||||
|
@ -696,24 +714,32 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
getUnhealthyRelocationCount);
|
||||
teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr());
|
||||
remoteTeamCollection->teamCollections = teamCollectionsPtrs;
|
||||
actors.push_back(reportErrorsExcept(
|
||||
DDTeamCollection::run(
|
||||
remoteTeamCollection, self->initData, tcis[1], recruitStorage, *ddEnabledState),
|
||||
"DDTeamCollectionSecondary",
|
||||
self->ddId,
|
||||
&normalDDQueueErrors()));
|
||||
actors.push_back(reportErrorsExcept(DDTeamCollection::run(remoteTeamCollection,
|
||||
self->initData,
|
||||
tcis[1],
|
||||
recruitStorage,
|
||||
*self->context->ddEnabledState.get()),
|
||||
"DDTeamCollectionSecondary",
|
||||
self->ddId,
|
||||
&normalDDQueueErrors()));
|
||||
actors.push_back(DDTeamCollection::printSnapshotTeamsInfo(remoteTeamCollection));
|
||||
}
|
||||
primaryTeamCollection->teamCollections = teamCollectionsPtrs;
|
||||
self->teamCollection = primaryTeamCollection.getPtr();
|
||||
actors.push_back(reportErrorsExcept(
|
||||
DDTeamCollection::run(primaryTeamCollection, self->initData, tcis[0], recruitStorage, *ddEnabledState),
|
||||
"DDTeamCollectionPrimary",
|
||||
self->ddId,
|
||||
&normalDDQueueErrors()));
|
||||
actors.push_back(reportErrorsExcept(DDTeamCollection::run(primaryTeamCollection,
|
||||
self->initData,
|
||||
tcis[0],
|
||||
recruitStorage,
|
||||
*self->context->ddEnabledState.get()),
|
||||
"DDTeamCollectionPrimary",
|
||||
self->ddId,
|
||||
&normalDDQueueErrors()));
|
||||
|
||||
actors.push_back(DDTeamCollection::printSnapshotTeamsInfo(primaryTeamCollection));
|
||||
actors.push_back(yieldPromiseStream(self->relocationProducer.getFuture(), self->relocationConsumer));
|
||||
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
|
||||
actors.push_back(monitorPhysicalShardStatus(self->physicalShardCollection));
|
||||
}
|
||||
|
||||
wait(waitForAll(actors));
|
||||
return Void();
|
||||
|
@ -750,17 +776,14 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
TraceEvent("DataDistributorTeamCollectionsDestroyed").error(err);
|
||||
if (removeFailedServer.getFuture().isReady() && !removeFailedServer.getFuture().isError()) {
|
||||
TraceEvent("RemoveFailedServer", removeFailedServer.getFuture().get()).error(err);
|
||||
wait(removeKeysFromFailedServer(
|
||||
cx, removeFailedServer.getFuture().get(), teamForDroppedRange, self->lock, ddEnabledState));
|
||||
Optional<UID> tssPairID;
|
||||
wait(removeStorageServer(
|
||||
cx, removeFailedServer.getFuture().get(), tssPairID, self->lock, ddEnabledState));
|
||||
wait(self->removeKeysFromFailedServer(removeFailedServer.getFuture().get(), teamForDroppedRange));
|
||||
wait(self->removeStorageServer(removeFailedServer.getFuture().get()));
|
||||
} else {
|
||||
if (err.code() != error_code_movekeys_conflict) {
|
||||
throw err;
|
||||
}
|
||||
|
||||
bool ddEnabled = wait(isDataDistributionEnabled(cx, ddEnabledState));
|
||||
bool ddEnabled = wait(self->isDataDistributionEnabled());
|
||||
TraceEvent("DataDistributionMoveKeysConflict").error(err).detail("DataDistributionEnabled", ddEnabled);
|
||||
if (ddEnabled) {
|
||||
throw err;
|
||||
|
@ -1301,12 +1324,12 @@ ACTOR Future<Void> ddGetMetrics(GetDataDistributorMetricsRequest req,
|
|||
}
|
||||
|
||||
ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncVar<ServerDBInfo> const> db) {
|
||||
state Reference<DataDistributor> self(new DataDistributor(db, di.id()));
|
||||
state Reference<DDSharedContext> context(new DDSharedContext(di.id()));
|
||||
state Reference<DataDistributor> self(new DataDistributor(db, di.id(), context));
|
||||
state Future<Void> collection = actorCollection(self->addActor.getFuture());
|
||||
state PromiseStream<GetMetricsListRequest> getShardMetricsList;
|
||||
state Database cx = openDBOnServer(db, TaskPriority::DefaultDelay, LockAware::True);
|
||||
state ActorCollection actors(false);
|
||||
state DDEnabledState ddEnabledState;
|
||||
state std::map<UID, DistributorSnapRequest> ddSnapReqMap;
|
||||
state std::map<UID, ErrorOr<Void>> ddSnapReqResultMap;
|
||||
self->addActor.send(actors.getResult());
|
||||
|
@ -1316,11 +1339,8 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
|
|||
TraceEvent("DataDistributorRunning", di.id());
|
||||
self->addActor.send(waitFailureServer(di.waitFailure.getFuture()));
|
||||
self->addActor.send(cacheServerWatcher(&cx));
|
||||
state Future<Void> distributor =
|
||||
reportErrorsExcept(dataDistribution(self, getShardMetricsList, &ddEnabledState),
|
||||
"DataDistribution",
|
||||
di.id(),
|
||||
&normalDataDistributorErrors());
|
||||
state Future<Void> distributor = reportErrorsExcept(
|
||||
dataDistribution(self, getShardMetricsList), "DataDistribution", di.id(), &normalDataDistributorErrors());
|
||||
|
||||
loop choose {
|
||||
when(wait(distributor || collection)) {
|
||||
|
@ -1351,7 +1371,8 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
|
|||
ddSnapReqMap[snapUID] = snapReq;
|
||||
} else {
|
||||
ddSnapReqMap[snapUID] = snapReq;
|
||||
actors.add(ddSnapCreate(snapReq, db, &ddEnabledState, &ddSnapReqMap, &ddSnapReqResultMap));
|
||||
actors.add(ddSnapCreate(
|
||||
snapReq, db, self->context->ddEnabledState.get(), &ddSnapReqMap, &ddSnapReqResultMap));
|
||||
auto* ddSnapReqResultMapPtr = &ddSnapReqResultMap;
|
||||
actors.add(fmap(
|
||||
[ddSnapReqResultMapPtr, snapUID](Void _) {
|
||||
|
@ -1421,10 +1442,14 @@ TEST_CASE("/DataDistribution/StorageWiggler/Order") {
|
|||
}
|
||||
|
||||
TEST_CASE("/DataDistribution/Initialization/ResumeFromShard") {
|
||||
state Reference<DDSharedContext> context(new DDSharedContext(UID()));
|
||||
state Reference<AsyncVar<ServerDBInfo> const> dbInfo;
|
||||
state Reference<DataDistributor> self(new DataDistributor(dbInfo, UID()));
|
||||
state Reference<DataDistributor> self(new DataDistributor(dbInfo, UID(), context));
|
||||
|
||||
self->shardsAffectedByTeamFailure = makeReference<ShardsAffectedByTeamFailure>();
|
||||
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
|
||||
self->physicalShardCollection = makeReference<PhysicalShardCollection>();
|
||||
}
|
||||
self->initData = makeReference<InitialDataDistribution>();
|
||||
self->configuration.usableRegions = 1;
|
||||
self->configuration.storageTeamSize = 1;
|
||||
|
|
|
@ -18,7 +18,9 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fdbserver/EncryptKeyProxyInterface.h"
|
||||
#include "fdbserver/GetEncryptCipherKeys.h"
|
||||
#include "flow/IRandom.h"
|
||||
|
||||
#include <boost/functional/hash.hpp>
|
||||
|
||||
|
@ -105,8 +107,12 @@ ACTOR Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>
|
|||
for (const EKPBaseCipherDetails& details : reply.baseCipherDetails) {
|
||||
EncryptCipherDomainId domainId = details.encryptDomainId;
|
||||
if (domains.count(domainId) > 0 && cipherKeys.count(domainId) == 0) {
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache->insertCipherKey(
|
||||
domainId, details.baseCipherId, details.baseCipherKey.begin(), details.baseCipherKey.size());
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache->insertCipherKey(domainId,
|
||||
details.baseCipherId,
|
||||
details.baseCipherKey.begin(),
|
||||
details.baseCipherKey.size(),
|
||||
details.refreshAt,
|
||||
details.expireAt);
|
||||
ASSERT(cipherKey.isValid());
|
||||
cipherKeys[domainId] = cipherKey;
|
||||
}
|
||||
|
@ -191,10 +197,10 @@ ACTOR Future<std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>> ge
|
|||
// Fetch any uncached cipher keys.
|
||||
loop choose {
|
||||
when(EKPGetBaseCipherKeysByIdsReply reply = wait(getUncachedEncryptCipherKeys(db, request))) {
|
||||
std::unordered_map<BaseCipherIndex, StringRef, boost::hash<BaseCipherIndex>> baseCipherKeys;
|
||||
std::unordered_map<BaseCipherIndex, EKPBaseCipherDetails, boost::hash<BaseCipherIndex>> baseCipherKeys;
|
||||
for (const EKPBaseCipherDetails& baseDetails : reply.baseCipherDetails) {
|
||||
BaseCipherIndex baseIdx = std::make_pair(baseDetails.encryptDomainId, baseDetails.baseCipherId);
|
||||
baseCipherKeys[baseIdx] = baseDetails.baseCipherKey;
|
||||
baseCipherKeys[baseIdx] = baseDetails;
|
||||
}
|
||||
// Insert base cipher keys into cache and construct result.
|
||||
for (const BlobCipherDetails& details : cipherDetails) {
|
||||
|
@ -211,9 +217,11 @@ ACTOR Future<std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>> ge
|
|||
}
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache->insertCipherKey(details.encryptDomainId,
|
||||
details.baseCipherId,
|
||||
itr->second.begin(),
|
||||
itr->second.size(),
|
||||
details.salt);
|
||||
itr->second.baseCipherKey.begin(),
|
||||
itr->second.baseCipherKey.size(),
|
||||
details.salt,
|
||||
itr->second.refreshAt,
|
||||
itr->second.expireAt);
|
||||
ASSERT(cipherKey.isValid());
|
||||
cipherKeys[details] = cipherKey;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fdbclient/ClientKnobs.h"
|
||||
#include "fdbclient/DatabaseContext.h"
|
||||
#include "fdbclient/Knobs.h"
|
||||
#include "fdbclient/Notified.h"
|
||||
|
@ -31,6 +32,7 @@
|
|||
#include "fdbserver/WaitFailure.h"
|
||||
#include "fdbserver/WorkerInterface.actor.h"
|
||||
#include "fdbrpc/sim_validation.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/flow.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
|
@ -560,7 +562,9 @@ ACTOR Future<Void> queueGetReadVersionRequests(
|
|||
// WARNING: this code is run at a high priority, so it needs to do as little work as possible
|
||||
bool canBeQueued = true;
|
||||
if (stats->txnRequestIn.getValue() - stats->txnRequestOut.getValue() >
|
||||
SERVER_KNOBS->START_TRANSACTION_MAX_QUEUE_SIZE) {
|
||||
SERVER_KNOBS->START_TRANSACTION_MAX_QUEUE_SIZE ||
|
||||
(g_network->isSimulated() && !g_simulator.speedUpSimulation &&
|
||||
deterministicRandom()->random01() < 0.01)) {
|
||||
// When the limit is hit, try to drop requests from the lower priority queues.
|
||||
if (req.priority == TransactionPriority::BATCH) {
|
||||
canBeQueued = false;
|
||||
|
|
|
@ -164,26 +164,6 @@ std::string getShardMappingKey(KeyRef key, StringRef prefix) {
|
|||
return prefix.toString() + key.toString();
|
||||
}
|
||||
|
||||
std::vector<std::pair<KeyRange, std::string>> decodeShardMapping(const RangeResult& result, StringRef prefix) {
|
||||
std::vector<std::pair<KeyRange, std::string>> shards;
|
||||
KeyRef endKey;
|
||||
std::string name;
|
||||
|
||||
for (const auto& kv : result) {
|
||||
auto keyWithoutPrefix = kv.key.removePrefix(prefix);
|
||||
if (name.size() > 0) {
|
||||
shards.push_back({ KeyRange(KeyRangeRef(endKey, keyWithoutPrefix)), name });
|
||||
TraceEvent(SevDebug, "DecodeShardMapping")
|
||||
.detail("BeginKey", endKey)
|
||||
.detail("EndKey", keyWithoutPrefix)
|
||||
.detail("Name", name);
|
||||
}
|
||||
endKey = keyWithoutPrefix;
|
||||
name = kv.value.toString();
|
||||
}
|
||||
return shards;
|
||||
}
|
||||
|
||||
void logRocksDBError(const rocksdb::Status& status, const std::string& method) {
|
||||
auto level = status.IsTimedOut() ? SevWarn : SevError;
|
||||
TraceEvent e(level, "ShardedRocksDBError");
|
||||
|
@ -219,7 +199,7 @@ const char* ShardOpToString(ShardOp op) {
|
|||
}
|
||||
}
|
||||
void logShardEvent(StringRef name, ShardOp op, Severity severity = SevInfo, const std::string& message = "") {
|
||||
TraceEvent e(severity, "ShardedRocksKVSShardEvent");
|
||||
TraceEvent e(severity, "ShardedRocksDBKVSShardEvent");
|
||||
e.detail("Name", name).detail("Action", ShardOpToString(op));
|
||||
if (!message.empty()) {
|
||||
e.detail("Message", message);
|
||||
|
@ -230,7 +210,7 @@ void logShardEvent(StringRef name,
|
|||
ShardOp op,
|
||||
Severity severity = SevInfo,
|
||||
const std::string& message = "") {
|
||||
TraceEvent e(severity, "ShardedRocksKVSShardEvent");
|
||||
TraceEvent e(severity, "ShardedRocksDBKVSShardEvent");
|
||||
e.detail("Name", name).detail("Action", ShardOpToString(op)).detail("Begin", range.begin).detail("End", range.end);
|
||||
if (message != "") {
|
||||
e.detail("Message", message);
|
||||
|
@ -284,7 +264,7 @@ rocksdb::ColumnFamilyOptions getCFOptions() {
|
|||
}
|
||||
|
||||
if (rocksdb_block_cache == nullptr) {
|
||||
rocksdb_block_cache = rocksdb::NewLRUCache(128);
|
||||
rocksdb_block_cache = rocksdb::NewLRUCache(SERVER_KNOBS->ROCKSDB_BLOCK_CACHE_SIZE);
|
||||
}
|
||||
bbOpts.block_cache = rocksdb_block_cache;
|
||||
|
||||
|
@ -301,6 +281,12 @@ rocksdb::Options getOptions() {
|
|||
options.IncreaseParallelism(SERVER_KNOBS->ROCKSDB_BACKGROUND_PARALLELISM);
|
||||
}
|
||||
|
||||
options.delete_obsolete_files_period_micros = SERVER_KNOBS->ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD * 1000000;
|
||||
options.max_total_wal_size = SERVER_KNOBS->ROCKSDB_MAX_TOTAL_WAL_SIZE;
|
||||
options.max_subcompactions = SERVER_KNOBS->ROCKSDB_MAX_SUBCOMPACTIONS;
|
||||
options.max_background_jobs = SERVER_KNOBS->ROCKSDB_MAX_BACKGROUND_JOBS;
|
||||
|
||||
options.db_write_buffer_size = SERVER_KNOBS->ROCKSDB_WRITE_BUFFER_SIZE;
|
||||
options.statistics = rocksdb::CreateDBStatistics();
|
||||
options.statistics->set_stats_level(rocksdb::kExceptHistogramOrTimers);
|
||||
options.db_log_dir = SERVER_KNOBS->LOG_DIRECTORY;
|
||||
|
@ -634,6 +620,7 @@ public:
|
|||
if (foundMetadata) {
|
||||
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
|
||||
.detail("PhysicalShardCount", handles.size());
|
||||
|
||||
for (auto handle : handles) {
|
||||
if (handle->GetName() == "kvs-metadata") {
|
||||
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
|
||||
|
@ -644,25 +631,58 @@ public:
|
|||
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
|
||||
.detail("PhysicalShard", handle->GetName());
|
||||
}
|
||||
RangeResult metadata;
|
||||
readRangeInDb(metadataShard.get(), prefixRange(shardMappingPrefix), UINT16_MAX, UINT16_MAX, &metadata);
|
||||
|
||||
std::vector<std::pair<KeyRange, std::string>> mapping = decodeShardMapping(metadata, shardMappingPrefix);
|
||||
|
||||
for (const auto& [range, name] : mapping) {
|
||||
TraceEvent(SevVerbose, "ShardedRocksLoadRange", this->logId)
|
||||
.detail("Range", range)
|
||||
.detail("PhysicalShard", name);
|
||||
auto it = physicalShards.find(name);
|
||||
// Raise error if physical shard is missing.
|
||||
if (it == physicalShards.end()) {
|
||||
TraceEvent(SevError, "ShardedRocksDB").detail("MissingShard", name);
|
||||
return rocksdb::Status::NotFound();
|
||||
KeyRange keyRange = prefixRange(shardMappingPrefix);
|
||||
while (true) {
|
||||
RangeResult metadata;
|
||||
const int bytes = readRangeInDb(metadataShard.get(),
|
||||
keyRange,
|
||||
std::max(2, SERVER_KNOBS->ROCKSDB_READ_RANGE_ROW_LIMIT),
|
||||
UINT16_MAX,
|
||||
&metadata);
|
||||
if (bytes <= 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
ASSERT_GT(metadata.size(), 0);
|
||||
for (int i = 0; i < metadata.size() - 1; ++i) {
|
||||
const std::string name = metadata[i].value.toString();
|
||||
KeyRangeRef range(metadata[i].key.removePrefix(shardMappingPrefix),
|
||||
metadata[i + 1].key.removePrefix(shardMappingPrefix));
|
||||
TraceEvent(SevVerbose, "DecodeShardMapping", this->logId)
|
||||
.detail("Range", range)
|
||||
.detail("Name", name);
|
||||
|
||||
// Empty name indicates the shard doesn't belong to the SS/KVS.
|
||||
if (name.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto it = physicalShards.find(name);
|
||||
// Raise error if physical shard is missing.
|
||||
if (it == physicalShards.end()) {
|
||||
TraceEvent(SevError, "ShardedRocksDB", this->logId).detail("MissingShard", name);
|
||||
return rocksdb::Status::NotFound();
|
||||
}
|
||||
|
||||
std::unique_ptr<DataShard> dataShard = std::make_unique<DataShard>(range, it->second.get());
|
||||
dataShardMap.insert(range, dataShard.get());
|
||||
it->second->dataShards[range.begin.toString()] = std::move(dataShard);
|
||||
activePhysicalShardIds.emplace(name);
|
||||
}
|
||||
|
||||
if (metadata.back().key.removePrefix(shardMappingPrefix) == specialKeys.end) {
|
||||
TraceEvent(SevVerbose, "ShardedRocksLoadShardMappingEnd", this->logId);
|
||||
break;
|
||||
}
|
||||
|
||||
// Read from the current last key since the shard begining with it hasn't been processed.
|
||||
if (metadata.size() == 1 && metadata.back().value.toString().empty()) {
|
||||
// Should not happen, just being paranoid.
|
||||
keyRange = KeyRangeRef(keyAfter(metadata.back().key), keyRange.end);
|
||||
} else {
|
||||
keyRange = KeyRangeRef(metadata.back().key, keyRange.end);
|
||||
}
|
||||
std::unique_ptr<DataShard> dataShard = std::make_unique<DataShard>(range, it->second.get());
|
||||
dataShardMap.insert(range, dataShard.get());
|
||||
it->second->dataShards[range.begin.toString()] = std::move(dataShard);
|
||||
activePhysicalShardIds.emplace(name);
|
||||
}
|
||||
// TODO: remove unused column families.
|
||||
} else {
|
||||
|
@ -673,7 +693,7 @@ public:
|
|||
std::shared_ptr<PhysicalShard> defaultShard = std::make_shared<PhysicalShard>(db, "default", handles[0]);
|
||||
columnFamilyMap[defaultShard->cf->GetID()] = defaultShard->cf;
|
||||
std::unique_ptr<DataShard> dataShard = std::make_unique<DataShard>(specialKeys, defaultShard.get());
|
||||
dataShardMap.insert(dataShard->range, dataShard.get());
|
||||
dataShardMap.insert(specialKeys, dataShard.get());
|
||||
defaultShard->dataShards[specialKeys.begin.toString()] = std::move(dataShard);
|
||||
physicalShards[defaultShard->id] = defaultShard;
|
||||
|
||||
|
@ -1045,7 +1065,7 @@ public:
|
|||
void logStats(rocksdb::DB* db);
|
||||
// PerfContext
|
||||
void resetPerfContext();
|
||||
void setPerfContext(int index);
|
||||
void collectPerfContext(int index);
|
||||
void logPerfContext(bool ignoreZeroMetric);
|
||||
// For Readers
|
||||
Reference<Histogram> getReadRangeLatencyHistogram(int index);
|
||||
|
@ -1349,9 +1369,9 @@ void RocksDBMetrics::logStats(rocksdb::DB* db) {
|
|||
e.detail(name, stat - cumulation);
|
||||
cumulation = stat;
|
||||
}
|
||||
for (auto& [name, property] : propertyStats) { // Zhe: TODO aggregation
|
||||
for (auto& [name, property] : propertyStats) {
|
||||
stat = 0;
|
||||
ASSERT(db->GetIntProperty(property, &stat));
|
||||
ASSERT(db->GetAggregatedIntProperty(property, &stat));
|
||||
e.detail(name, stat);
|
||||
}
|
||||
}
|
||||
|
@ -1360,22 +1380,22 @@ void RocksDBMetrics::logMemUsage(rocksdb::DB* db) {
|
|||
TraceEvent e(SevInfo, "ShardedRocksDBMemMetrics", debugID);
|
||||
uint64_t stat;
|
||||
ASSERT(db != nullptr);
|
||||
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &stat));
|
||||
ASSERT(db->GetAggregatedIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &stat));
|
||||
e.detail("BlockCacheUsage", stat);
|
||||
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kEstimateTableReadersMem, &stat));
|
||||
ASSERT(db->GetAggregatedIntProperty(rocksdb::DB::Properties::kEstimateTableReadersMem, &stat));
|
||||
e.detail("EstimateSstReaderBytes", stat);
|
||||
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kCurSizeAllMemTables, &stat));
|
||||
ASSERT(db->GetAggregatedIntProperty(rocksdb::DB::Properties::kCurSizeAllMemTables, &stat));
|
||||
e.detail("AllMemtablesBytes", stat);
|
||||
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCachePinnedUsage, &stat));
|
||||
ASSERT(db->GetAggregatedIntProperty(rocksdb::DB::Properties::kBlockCachePinnedUsage, &stat));
|
||||
e.detail("BlockCachePinnedUsage", stat);
|
||||
}
|
||||
|
||||
void RocksDBMetrics::resetPerfContext() {
|
||||
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
|
||||
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
|
||||
rocksdb::get_perf_context()->Reset();
|
||||
}
|
||||
|
||||
void RocksDBMetrics::setPerfContext(int index) {
|
||||
void RocksDBMetrics::collectPerfContext(int index) {
|
||||
for (auto& [name, metric, vals] : perfContextMetrics) {
|
||||
vals[index] = getRocksdbPerfcontextMetric(metric);
|
||||
}
|
||||
|
@ -1391,12 +1411,7 @@ void RocksDBMetrics::logPerfContext(bool ignoreZeroMetric) {
|
|||
}
|
||||
if (ignoreZeroMetric && s == 0)
|
||||
continue;
|
||||
for (int i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; i++) {
|
||||
if (vals[i] != 0)
|
||||
e.detail("RD" + std::to_string(i) + name, vals[i]);
|
||||
}
|
||||
if (vals[SERVER_KNOBS->ROCKSDB_READ_PARALLELISM] != 0)
|
||||
e.detail("WR" + (std::string)name, vals[SERVER_KNOBS->ROCKSDB_READ_PARALLELISM]);
|
||||
e.detail(name, s);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1612,6 +1627,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
std::unordered_map<uint32_t, rocksdb::ColumnFamilyHandle*>* columnFamilyMap;
|
||||
std::shared_ptr<RocksDBMetrics> rocksDBMetrics;
|
||||
std::shared_ptr<rocksdb::RateLimiter> rateLimiter;
|
||||
double sampleStartTime;
|
||||
|
||||
explicit Writer(UID logId,
|
||||
int threadIndex,
|
||||
|
@ -1625,12 +1641,22 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
10, // fairness
|
||||
rocksdb::RateLimiter::Mode::kWritesOnly,
|
||||
SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE)
|
||||
: nullptr) {}
|
||||
: nullptr),
|
||||
sampleStartTime(now()) {}
|
||||
|
||||
~Writer() override {}
|
||||
|
||||
void init() override {}
|
||||
|
||||
void sample() {
|
||||
if (SERVER_KNOBS->ROCKSDB_METRICS_SAMPLE_INTERVAL > 0 &&
|
||||
now() - sampleStartTime >= SERVER_KNOBS->ROCKSDB_METRICS_SAMPLE_INTERVAL) {
|
||||
rocksDBMetrics->collectPerfContext(threadIndex);
|
||||
rocksDBMetrics->resetPerfContext();
|
||||
sampleStartTime = now();
|
||||
}
|
||||
}
|
||||
|
||||
struct OpenAction : TypedAction<Writer, OpenAction> {
|
||||
ShardManager* shardManager;
|
||||
rocksdb::Options dbOptions;
|
||||
|
@ -1709,7 +1735,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
ThreadReturnPromise<Void> done;
|
||||
double startTime;
|
||||
bool getHistograms;
|
||||
bool getPerfContext;
|
||||
bool logShardMemUsage;
|
||||
double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
|
||||
CommitAction(rocksdb::DB* db,
|
||||
|
@ -1724,12 +1749,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
} else {
|
||||
getHistograms = false;
|
||||
}
|
||||
if ((SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE != 0) &&
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE)) {
|
||||
getPerfContext = true;
|
||||
} else {
|
||||
getPerfContext = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1798,9 +1817,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
}
|
||||
|
||||
void action(CommitAction& a) {
|
||||
if (a.getPerfContext) {
|
||||
rocksDBMetrics->resetPerfContext();
|
||||
}
|
||||
double commitBeginTime;
|
||||
if (a.getHistograms) {
|
||||
commitBeginTime = timer_monotonic();
|
||||
|
@ -1832,9 +1848,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
rocksDBMetrics->getCommitLatencyHistogram()->sampleSeconds(currTime - a.startTime);
|
||||
}
|
||||
|
||||
if (a.getPerfContext) {
|
||||
rocksDBMetrics->setPerfContext(threadIndex);
|
||||
}
|
||||
sample();
|
||||
}
|
||||
|
||||
struct CloseAction : TypedAction<Writer, CloseAction> {
|
||||
|
@ -1864,9 +1878,10 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
double readRangeTimeout;
|
||||
int threadIndex;
|
||||
std::shared_ptr<RocksDBMetrics> rocksDBMetrics;
|
||||
double sampleStartTime;
|
||||
|
||||
explicit Reader(UID logId, int threadIndex, std::shared_ptr<RocksDBMetrics> rocksDBMetrics)
|
||||
: logId(logId), threadIndex(threadIndex), rocksDBMetrics(rocksDBMetrics) {
|
||||
: logId(logId), threadIndex(threadIndex), rocksDBMetrics(rocksDBMetrics), sampleStartTime(now()) {
|
||||
if (g_network->isSimulated()) {
|
||||
// In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have
|
||||
// very high load and single read thread cannot process all the load within the timeouts.
|
||||
|
@ -1882,33 +1897,33 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
|
||||
void init() override {}
|
||||
|
||||
void sample() {
|
||||
if (SERVER_KNOBS->ROCKSDB_METRICS_SAMPLE_INTERVAL > 0 &&
|
||||
now() - sampleStartTime >= SERVER_KNOBS->ROCKSDB_METRICS_SAMPLE_INTERVAL) {
|
||||
rocksDBMetrics->collectPerfContext(threadIndex);
|
||||
rocksDBMetrics->resetPerfContext();
|
||||
sampleStartTime = now();
|
||||
}
|
||||
}
|
||||
struct ReadValueAction : TypedAction<Reader, ReadValueAction> {
|
||||
Key key;
|
||||
PhysicalShard* shard;
|
||||
Optional<UID> debugID;
|
||||
double startTime;
|
||||
bool getHistograms;
|
||||
bool getPerfContext;
|
||||
bool logShardMemUsage;
|
||||
ThreadReturnPromise<Optional<Value>> result;
|
||||
|
||||
ReadValueAction(KeyRef key, PhysicalShard* shard, Optional<UID> debugID)
|
||||
: key(key), shard(shard), debugID(debugID), startTime(timer_monotonic()),
|
||||
getHistograms(
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false),
|
||||
getPerfContext(
|
||||
(SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE != 0) &&
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE)
|
||||
? true
|
||||
: false) {}
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false) {
|
||||
}
|
||||
|
||||
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
|
||||
};
|
||||
|
||||
void action(ReadValueAction& a) {
|
||||
if (a.getPerfContext) {
|
||||
rocksDBMetrics->resetPerfContext();
|
||||
}
|
||||
double readBeginTime = timer_monotonic();
|
||||
if (a.getHistograms) {
|
||||
rocksDBMetrics->getReadValueQueueWaitHistogram(threadIndex)->sampleSeconds(readBeginTime - a.startTime);
|
||||
|
@ -1961,9 +1976,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
rocksDBMetrics->getReadValueActionHistogram(threadIndex)->sampleSeconds(currTime - readBeginTime);
|
||||
rocksDBMetrics->getReadValueLatencyHistogram(threadIndex)->sampleSeconds(currTime - a.startTime);
|
||||
}
|
||||
if (a.getPerfContext) {
|
||||
rocksDBMetrics->setPerfContext(threadIndex);
|
||||
}
|
||||
|
||||
sample();
|
||||
}
|
||||
|
||||
struct ReadValuePrefixAction : TypedAction<Reader, ReadValuePrefixAction> {
|
||||
|
@ -1973,26 +1987,18 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
Optional<UID> debugID;
|
||||
double startTime;
|
||||
bool getHistograms;
|
||||
bool getPerfContext;
|
||||
bool logShardMemUsage;
|
||||
ThreadReturnPromise<Optional<Value>> result;
|
||||
|
||||
ReadValuePrefixAction(Key key, int maxLength, PhysicalShard* shard, Optional<UID> debugID)
|
||||
: key(key), maxLength(maxLength), shard(shard), debugID(debugID), startTime(timer_monotonic()),
|
||||
getHistograms(
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false),
|
||||
getPerfContext(
|
||||
(SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE != 0) &&
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE)
|
||||
? true
|
||||
: false){};
|
||||
getHistograms((deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE)
|
||||
? true
|
||||
: false){};
|
||||
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
|
||||
};
|
||||
|
||||
void action(ReadValuePrefixAction& a) {
|
||||
if (a.getPerfContext) {
|
||||
rocksDBMetrics->resetPerfContext();
|
||||
}
|
||||
double readBeginTime = timer_monotonic();
|
||||
if (a.getHistograms) {
|
||||
rocksDBMetrics->getReadPrefixQueueWaitHistogram(threadIndex)
|
||||
|
@ -2050,9 +2056,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
rocksDBMetrics->getReadPrefixActionHistogram(threadIndex)->sampleSeconds(currTime - readBeginTime);
|
||||
rocksDBMetrics->getReadPrefixLatencyHistogram(threadIndex)->sampleSeconds(currTime - a.startTime);
|
||||
}
|
||||
if (a.getPerfContext) {
|
||||
rocksDBMetrics->setPerfContext(threadIndex);
|
||||
}
|
||||
|
||||
sample();
|
||||
}
|
||||
|
||||
struct ReadRangeAction : TypedAction<Reader, ReadRangeAction>, FastAllocated<ReadRangeAction> {
|
||||
|
@ -2061,18 +2066,12 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
int rowLimit, byteLimit;
|
||||
double startTime;
|
||||
bool getHistograms;
|
||||
bool getPerfContext;
|
||||
bool logShardMemUsage;
|
||||
ThreadReturnPromise<RangeResult> result;
|
||||
ReadRangeAction(KeyRange keys, std::vector<DataShard*> shards, int rowLimit, int byteLimit)
|
||||
: keys(keys), rowLimit(rowLimit), byteLimit(byteLimit), startTime(timer_monotonic()),
|
||||
getHistograms(
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false),
|
||||
getPerfContext(
|
||||
(SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE != 0) &&
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE)
|
||||
? true
|
||||
: false) {
|
||||
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false) {
|
||||
for (const DataShard* shard : shards) {
|
||||
if (shard != nullptr) {
|
||||
shardRanges.emplace_back(shard->physicalShard, keys & shard->range);
|
||||
|
@ -2083,9 +2082,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
};
|
||||
|
||||
void action(ReadRangeAction& a) {
|
||||
if (a.getPerfContext) {
|
||||
rocksDBMetrics->resetPerfContext();
|
||||
}
|
||||
double readBeginTime = timer_monotonic();
|
||||
if (a.getHistograms) {
|
||||
rocksDBMetrics->getReadRangeQueueWaitHistogram(threadIndex)->sampleSeconds(readBeginTime - a.startTime);
|
||||
|
@ -2149,9 +2145,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||
rocksDBMetrics->getReadRangeActionHistogram(threadIndex)->sampleSeconds(currTime - readBeginTime);
|
||||
rocksDBMetrics->getReadRangeLatencyHistogram(threadIndex)->sampleSeconds(currTime - a.startTime);
|
||||
}
|
||||
if (a.getPerfContext) {
|
||||
rocksDBMetrics->setPerfContext(threadIndex);
|
||||
}
|
||||
|
||||
sample();
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@
|
|||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbserver/EncryptedMutationMessage.h"
|
||||
#include "fdbserver/MutationTracking.h"
|
||||
#include "fdbserver/LogProtocolMessage.h"
|
||||
#include "fdbserver/SpanContextMessage.h"
|
||||
|
@ -103,8 +102,6 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri
|
|||
BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion()));
|
||||
OTELSpanContextMessage scm;
|
||||
br >> scm;
|
||||
} else if (EncryptedMutationMessage::startsEncryptedMutationMessage(mutationType)) {
|
||||
throw encrypt_unsupported();
|
||||
} else {
|
||||
MutationRef m;
|
||||
BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion()));
|
||||
|
|
|
@ -24,7 +24,6 @@
|
|||
#include "flow/UnitTest.h"
|
||||
#include "fdbclient/BackupContainer.h"
|
||||
#include "fdbclient/BackupAgent.actor.h"
|
||||
#include "fdbserver/EncryptedMutationMessage.h"
|
||||
#include "fdbserver/RestoreLoader.actor.h"
|
||||
#include "fdbserver/RestoreRoleCommon.actor.h"
|
||||
#include "fdbserver/MutationTracking.h"
|
||||
|
@ -423,11 +422,11 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
|
|||
ASSERT(inserted);
|
||||
|
||||
ArenaReader rd(buf.arena(), StringRef(message, msgSize), AssumeVersion(g_network->protocolVersion()));
|
||||
if (EncryptedMutationMessage::isNextIn(rd)) {
|
||||
throw encrypt_unsupported();
|
||||
}
|
||||
MutationRef mutation;
|
||||
rd >> mutation;
|
||||
if (mutation.isEncrypted()) {
|
||||
throw encrypt_unsupported();
|
||||
}
|
||||
|
||||
// Skip mutation whose commitVesion < range kv's version
|
||||
if (logMutationTooOld(pRangeVersions, mutation, msgVersion.version)) {
|
||||
|
|
|
@ -76,18 +76,20 @@ struct SimKmsConnectorContext : NonCopyable, ReferenceCounted<SimKmsConnectorCon
|
|||
};
|
||||
|
||||
namespace {
|
||||
Optional<int64_t> getRefreshInterval(int64_t now, int64_t defaultTtl) {
|
||||
Optional<int64_t> getRefreshInterval(const int64_t now, const int64_t defaultTtl) {
|
||||
if (BUGGIFY) {
|
||||
return Optional<int64_t>(now + defaultTtl);
|
||||
return Optional<int64_t>(now);
|
||||
}
|
||||
return Optional<int64_t>();
|
||||
return Optional<int64_t>(now + defaultTtl);
|
||||
}
|
||||
|
||||
Optional<int64_t> getExpireInterval(Optional<int64_t> refTS) {
|
||||
Optional<int64_t> getExpireInterval(Optional<int64_t> refTS, const int64_t defaultTtl) {
|
||||
ASSERT(refTS.present());
|
||||
|
||||
if (BUGGIFY) {
|
||||
return Optional<int64_t>(-1);
|
||||
}
|
||||
return refTS;
|
||||
return (refTS.get() + defaultTtl);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
|
@ -105,11 +107,17 @@ ACTOR Future<Void> ekLookupByIds(Reference<SimKmsConnectorContext> ctx,
|
|||
}
|
||||
|
||||
// Lookup corresponding EncryptKeyCtx for input keyId
|
||||
const int64_t currTS = (int64_t)now();
|
||||
// Fetch default TTL to avoid BUGGIFY giving different value per invocation causing refTS > expTS
|
||||
const int64_t defaultTtl = FLOW_KNOBS->ENCRYPT_CIPHER_KEY_CACHE_TTL;
|
||||
Optional<int64_t> refAtTS = getRefreshInterval(currTS, defaultTtl);
|
||||
Optional<int64_t> expAtTS = getExpireInterval(refAtTS, defaultTtl);
|
||||
TraceEvent("SimKms.EKLookupById").detail("RefreshAt", refAtTS).detail("ExpireAt", expAtTS);
|
||||
for (const auto& item : req.encryptKeyInfos) {
|
||||
const auto& itr = ctx->simEncryptKeyStore.find(item.baseCipherId);
|
||||
if (itr != ctx->simEncryptKeyStore.end()) {
|
||||
rep.cipherKeyDetails.emplace_back_deep(
|
||||
rep.arena, item.domainId, itr->first, StringRef(itr->second.get()->key));
|
||||
rep.arena, item.domainId, itr->first, StringRef(itr->second.get()->key), refAtTS, expAtTS);
|
||||
|
||||
if (dbgKIdTrace.present()) {
|
||||
// {encryptDomainId, baseCipherId} forms a unique tuple across encryption domains
|
||||
|
@ -145,11 +153,12 @@ ACTOR Future<Void> ekLookupByDomainIds(Reference<SimKmsConnectorContext> ctx,
|
|||
// Map encryptionDomainId to corresponding EncryptKeyCtx element using a modulo operation. This
|
||||
// would mean multiple domains gets mapped to the same encryption key which is fine, the
|
||||
// EncryptKeyStore guarantees that keyId -> plaintext encryptKey mapping is idempotent.
|
||||
int64_t currTS = (int64_t)now();
|
||||
const int64_t currTS = (int64_t)now();
|
||||
// Fetch default TTL to avoid BUGGIFY giving different value per invocation causing refTS > expTS
|
||||
int64_t defaultTtl = FLOW_KNOBS->ENCRYPT_CIPHER_KEY_CACHE_TTL;
|
||||
const int64_t defaultTtl = FLOW_KNOBS->ENCRYPT_CIPHER_KEY_CACHE_TTL;
|
||||
Optional<int64_t> refAtTS = getRefreshInterval(currTS, defaultTtl);
|
||||
Optional<int64_t> expAtTS = getExpireInterval(refAtTS);
|
||||
Optional<int64_t> expAtTS = getExpireInterval(refAtTS, defaultTtl);
|
||||
TraceEvent("SimKms.EKLookupByDomainId").detail("RefreshAt", refAtTS).detail("ExpireAt", expAtTS);
|
||||
for (const auto& info : req.encryptDomainInfos) {
|
||||
EncryptCipherBaseKeyId keyId = 1 + abs(info.domainId) % SERVER_KNOBS->SIM_KMS_MAX_KEYS;
|
||||
const auto& itr = ctx->simEncryptKeyStore.find(keyId);
|
||||
|
|
|
@ -1078,76 +1078,61 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
|
|||
return processMap;
|
||||
}
|
||||
|
||||
struct ClientStats {
|
||||
int count;
|
||||
std::set<std::pair<NetworkAddress, Key>> examples;
|
||||
|
||||
ClientStats() : count(0) {}
|
||||
};
|
||||
|
||||
static JsonBuilderObject clientStatusFetcher(
|
||||
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>>* clientStatusMap) {
|
||||
JsonBuilderObject clientStatus;
|
||||
|
||||
int64_t clientCount = 0;
|
||||
std::map<Key, ClientStats> issues;
|
||||
std::map<Standalone<ClientVersionRef>, ClientStats> supportedVersions;
|
||||
std::map<Key, ClientStats> maxSupportedProtocol;
|
||||
// Here we handle versions and maxSupportedProtocols, the issues will be handled in getClientIssuesAsMessages
|
||||
std::map<Standalone<ClientVersionRef>, OpenDatabaseRequest::Samples> supportedVersions;
|
||||
std::map<Key, OpenDatabaseRequest::Samples> maxSupportedProtocol;
|
||||
|
||||
for (auto iter = clientStatusMap->begin(); iter != clientStatusMap->end();) {
|
||||
if (now() - iter->second.first < 2 * SERVER_KNOBS->COORDINATOR_REGISTER_INTERVAL) {
|
||||
clientCount += iter->second.second.clientCount;
|
||||
for (auto& it : iter->second.second.issues) {
|
||||
auto& issue = issues[it.item];
|
||||
issue.count += it.count;
|
||||
issue.examples.insert(it.examples.begin(), it.examples.end());
|
||||
}
|
||||
for (auto& it : iter->second.second.supportedVersions) {
|
||||
auto& version = supportedVersions[it.item];
|
||||
version.count += it.count;
|
||||
version.examples.insert(it.examples.begin(), it.examples.end());
|
||||
}
|
||||
for (auto& it : iter->second.second.maxProtocolSupported) {
|
||||
auto& protocolVersion = maxSupportedProtocol[it.item];
|
||||
protocolVersion.count += it.count;
|
||||
protocolVersion.examples.insert(it.examples.begin(), it.examples.end());
|
||||
}
|
||||
++iter;
|
||||
} else {
|
||||
if (now() - iter->second.first >= 2 * SERVER_KNOBS->COORDINATOR_REGISTER_INTERVAL) {
|
||||
iter = clientStatusMap->erase(iter);
|
||||
continue;
|
||||
}
|
||||
|
||||
clientCount += iter->second.second.clientCount;
|
||||
for (const auto& [version, samples] : iter->second.second.supportedVersions) {
|
||||
supportedVersions[version] += samples;
|
||||
}
|
||||
for (const auto& [protocol, samples] : iter->second.second.maxProtocolSupported) {
|
||||
maxSupportedProtocol[protocol] += samples;
|
||||
}
|
||||
++iter;
|
||||
}
|
||||
|
||||
clientStatus["count"] = clientCount;
|
||||
|
||||
JsonBuilderArray versionsArray = JsonBuilderArray();
|
||||
for (auto& cv : supportedVersions) {
|
||||
for (const auto& [clientVersionRef, samples] : supportedVersions) {
|
||||
JsonBuilderObject ver;
|
||||
ver["count"] = (int64_t)cv.second.count;
|
||||
ver["client_version"] = cv.first.clientVersion.toString();
|
||||
ver["protocol_version"] = cv.first.protocolVersion.toString();
|
||||
ver["source_version"] = cv.first.sourceVersion.toString();
|
||||
ver["count"] = (int64_t)samples.count;
|
||||
ver["client_version"] = clientVersionRef.clientVersion.toString();
|
||||
ver["protocol_version"] = clientVersionRef.protocolVersion.toString();
|
||||
ver["source_version"] = clientVersionRef.sourceVersion.toString();
|
||||
|
||||
JsonBuilderArray clients = JsonBuilderArray();
|
||||
for (auto& client : cv.second.examples) {
|
||||
for (const auto& [networkAddress, trackLogGroup] : samples.samples) {
|
||||
JsonBuilderObject cli;
|
||||
cli["address"] = client.first.toString();
|
||||
cli["log_group"] = client.second.toString();
|
||||
cli["address"] = networkAddress.toString();
|
||||
cli["log_group"] = trackLogGroup.toString();
|
||||
clients.push_back(cli);
|
||||
}
|
||||
|
||||
auto iter = maxSupportedProtocol.find(cv.first.protocolVersion);
|
||||
if (iter != maxSupportedProtocol.end()) {
|
||||
auto iter = maxSupportedProtocol.find(clientVersionRef.protocolVersion);
|
||||
if (iter != std::end(maxSupportedProtocol)) {
|
||||
JsonBuilderArray maxClients = JsonBuilderArray();
|
||||
for (auto& client : iter->second.examples) {
|
||||
for (const auto& [networkAddress, trackLogGroup] : iter->second.samples) {
|
||||
JsonBuilderObject cli;
|
||||
cli["address"] = client.first.toString();
|
||||
cli["log_group"] = client.second.toString();
|
||||
cli["address"] = networkAddress.toString();
|
||||
cli["log_group"] = trackLogGroup.toString();
|
||||
maxClients.push_back(cli);
|
||||
}
|
||||
ver["max_protocol_count"] = iter->second.count;
|
||||
ver["max_protocol_clients"] = maxClients;
|
||||
maxSupportedProtocol.erase(cv.first.protocolVersion);
|
||||
maxSupportedProtocol.erase(clientVersionRef.protocolVersion);
|
||||
}
|
||||
|
||||
ver["connected_clients"] = clients;
|
||||
|
@ -2660,18 +2645,19 @@ static JsonBuilderArray getClientIssuesAsMessages(
|
|||
std::map<std::string, std::pair<int, std::vector<std::string>>> deduplicatedIssues;
|
||||
|
||||
for (auto iter = clientStatusMap->begin(); iter != clientStatusMap->end();) {
|
||||
if (now() - iter->second.first < 2 * SERVER_KNOBS->COORDINATOR_REGISTER_INTERVAL) {
|
||||
for (auto& issue : iter->second.second.issues) {
|
||||
auto& t = deduplicatedIssues[issue.item.toString()];
|
||||
t.first += issue.count;
|
||||
for (auto& example : issue.examples) {
|
||||
t.second.push_back(formatIpPort(example.first.ip, example.first.port));
|
||||
}
|
||||
}
|
||||
++iter;
|
||||
} else {
|
||||
if (now() - iter->second.first >= 2 * SERVER_KNOBS->COORDINATOR_REGISTER_INTERVAL) {
|
||||
iter = clientStatusMap->erase(iter);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (const auto& [issueKey, samples] : iter->second.second.issues) {
|
||||
auto& t = deduplicatedIssues[issueKey.toString()];
|
||||
t.first += samples.count;
|
||||
for (const auto& sample : samples.samples) {
|
||||
t.second.push_back(formatIpPort(sample.first.ip, sample.first.port));
|
||||
}
|
||||
}
|
||||
++iter;
|
||||
}
|
||||
|
||||
// FIXME: add the log_group in addition to the network address
|
||||
|
|
|
@ -23,7 +23,6 @@
|
|||
#include "fdbclient/FDBOptions.g.h"
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "fdbserver/EncryptedMutationMessage.h"
|
||||
#include "fdbserver/GetEncryptCipherKeys.h"
|
||||
#include "fdbserver/Knobs.h"
|
||||
#include "fdbserver/ServerDBInfo.h"
|
||||
|
@ -1909,25 +1908,19 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
|
|||
OTELSpanContextMessage::isNextIn(cloneReader)) {
|
||||
OTELSpanContextMessage scm;
|
||||
cloneReader >> scm;
|
||||
} else if (cloneReader.protocolVersion().hasEncryptionAtRest() &&
|
||||
EncryptedMutationMessage::isNextIn(cloneReader) && !cipherKeys.present()) {
|
||||
// Encrypted mutation found, but cipher keys haven't been fetch.
|
||||
// Collect cipher details to fetch cipher keys in one batch.
|
||||
EncryptedMutationMessage emm;
|
||||
cloneReader >> emm;
|
||||
cipherDetails.insert(emm.header.cipherTextDetails);
|
||||
cipherDetails.insert(emm.header.cipherHeaderDetails);
|
||||
collectingCipherKeys = true;
|
||||
} else {
|
||||
MutationRef msg;
|
||||
if (cloneReader.protocolVersion().hasEncryptionAtRest() &&
|
||||
EncryptedMutationMessage::isNextIn(cloneReader)) {
|
||||
assert(cipherKeys.present());
|
||||
msg = EncryptedMutationMessage::decrypt(cloneReader, cloneReader.arena(), cipherKeys.get());
|
||||
} else {
|
||||
cloneReader >> msg;
|
||||
cloneReader >> msg;
|
||||
if (msg.isEncrypted()) {
|
||||
if (!cipherKeys.present()) {
|
||||
const BlobCipherEncryptHeader* header = msg.encryptionHeader();
|
||||
cipherDetails.insert(header->cipherTextDetails);
|
||||
cipherDetails.insert(header->cipherHeaderDetails);
|
||||
collectingCipherKeys = true;
|
||||
} else {
|
||||
msg = msg.decrypt(cipherKeys.get(), cloneReader.arena());
|
||||
}
|
||||
}
|
||||
|
||||
if (!collectingCipherKeys) {
|
||||
if (firstMutation && msg.param1.startsWith(systemKeys.end))
|
||||
hasPrivateData = true;
|
||||
|
@ -2019,10 +2012,9 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
|
|||
reader >> oscm;
|
||||
} else {
|
||||
MutationRef msg;
|
||||
if (reader.protocolVersion().hasEncryptionAtRest() && EncryptedMutationMessage::isNextIn(reader)) {
|
||||
msg = EncryptedMutationMessage::decrypt(reader, reader.arena(), cipherKeys.get());
|
||||
} else {
|
||||
reader >> msg;
|
||||
reader >> msg;
|
||||
if (msg.isEncrypted()) {
|
||||
msg = msg.decrypt(cipherKeys.get(), reader.arena());
|
||||
}
|
||||
|
||||
if (ver != invalidVersion) // This change belongs to a version < minVersion
|
||||
|
|
|
@ -1092,7 +1092,10 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
|
|||
BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned())));
|
||||
logData->persistentDataVersion = newPersistentDataVersion;
|
||||
|
||||
wait(self->persistentData->commit()); // SOMEDAY: This seems to be running pretty often, should we slow it down???
|
||||
// SOMEDAY: This seems to be running pretty often, should we slow it down???
|
||||
// This needs a timeout since nothing prevents I/O operations from hanging indefinitely.
|
||||
wait(ioTimeoutError(self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
|
||||
|
||||
wait(delay(0, TaskPriority::UpdateStorage));
|
||||
|
||||
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
|
||||
|
@ -2356,7 +2359,7 @@ ACTOR Future<Void> initPersistentState(TLogData* self, Reference<LogData> logDat
|
|||
|
||||
// PERSIST: Initial setup of persistentData for a brand new tLog for a new database
|
||||
state IKeyValueStore* storage = self->persistentData;
|
||||
wait(ioTimeoutError(storage->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
|
||||
wait(storage->init());
|
||||
storage->set(persistFormat);
|
||||
storage->set(
|
||||
KeyValueRef(BinaryWriter::toValue(logData->logId, Unversioned()).withPrefix(persistCurrentVersionKeys.begin),
|
||||
|
@ -2388,7 +2391,7 @@ ACTOR Future<Void> initPersistentState(TLogData* self, Reference<LogData> logDat
|
|||
}
|
||||
|
||||
TraceEvent("TLogInitCommit", logData->logId).log();
|
||||
wait(ioTimeoutError(self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
|
||||
wait(self->persistentData->commit());
|
||||
return Void();
|
||||
}
|
||||
|
||||
|
@ -3417,7 +3420,8 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
|
|||
|
||||
logData->unpoppedRecoveredTagCount = req.allTags.size();
|
||||
logData->unpoppedRecoveredTags = std::set<Tag>(req.allTags.begin(), req.allTags.end());
|
||||
wait(initPersistentState(self, logData) || logData->removed);
|
||||
wait(ioTimeoutError(initPersistentState(self, logData) || logData->removed,
|
||||
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
|
||||
|
||||
TraceEvent("TLogRecover", self->dbgid)
|
||||
.detail("LogId", logData->logId)
|
||||
|
@ -3481,7 +3485,8 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
logData->addActor.send(respondToRecovered(recruited, logData->recoveryComplete));
} else {
// Brand new tlog, initialization has already been done by caller
wait(initPersistentState(self, logData) || logData->removed);
wait(ioTimeoutError(initPersistentState(self, logData) || logData->removed,
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));

if (logData->recoveryComplete.isSet()) {
throw worker_removed();
@ -4954,6 +4954,9 @@ public:
constexpr static FileIdentifier file_identifier = 10847329;
constexpr static unsigned int FORMAT_VERSION = 17;

// Maximum size of the root pointer
constexpr static int maxRootPointerSize = 3000 / sizeof(LogicalPageID);

// This serves as the format version for the entire tree, individual pages will not be versioned
uint32_t formatVersion;
EncodingType encodingType;
@ -5961,15 +5964,22 @@ private:
return records;
}

ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> buildNewRoot(
// Takes the list of records returned by commitSubtree() on the root and builds new root pages
// until there is only 1 root record, which is small enough to fit in the BTree commit header.
ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> buildNewRootsIfNeeded(
VersionedBTree* self,
Version version,
Standalone<VectorRef<RedwoodRecordRef>> records,
unsigned int height) {
debug_printf("buildNewRoot start version %" PRId64 ", %d records\n", version, records.size());

// While there are multiple child pages for this version we must write new tree levels.
while (records.size() > 1) {
// While there are multiple child records, or there is only one but it is too large to fit in the BTree
// commit record, build a new root page and update records to be a link to that new page.
// Root pointer size is limited because the pager commit header is limited to smallestPhysicalBlock in
// size.
while (records.size() > 1 ||
records.front().getChildPage().size() > (BUGGIFY ? 1 : BTreeCommitHeader::maxRootPointerSize)) {
CODE_PROBE(records.size() == 1, "Writing a new root because the current root pointer would be too large");
self->m_header.height = ++height;
ASSERT(height < std::numeric_limits<int8_t>::max());
Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait(
@ -7299,14 +7309,10 @@ private:
self->m_pager->updatePage(PagerEventReasons::Commit, self->m_header.height, rootNodeLink, page);
} else {
Standalone<VectorRef<RedwoodRecordRef>> newRootRecords(all.newLinks, all.newLinks.arena());
if (newRootRecords.size() == 1) {
rootNodeLink = newRootRecords.front().getChildPage();
} else {
// If the new root level's size is not 1 then build new root level(s)
Standalone<VectorRef<RedwoodRecordRef>> newRootPage =
wait(buildNewRoot(self, batch.writeVersion, newRootRecords, self->m_header.height));
rootNodeLink = newRootPage.front().getChildPage();
}
// Build new root levels if there are multiple new root records or if the root pointer is too large
Standalone<VectorRef<RedwoodRecordRef>> newRootPage =
wait(buildNewRootsIfNeeded(self, batch.writeVersion, newRootRecords, self->m_header.height));
rootNodeLink = newRootPage.front().getChildPage();
}
}
@ -0,0 +1,35 @@
/*
* DDRelocationQueue.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOUNDATIONDB_DDRELOCATIONQUEUE_H
#define FOUNDATIONDB_DDRELOCATIONQUEUE_H

#include "DataDistribution.actor.h"
// send request/signal to DDRelocationQueue through interface
// call synchronous method from components outside DDRelocationQueue
struct IDDRelocationQueue {
PromiseStream<RelocateShard> relocationProducer, relocationConsumer; // FIXME(xwang): not used yet
// PromiseStream<Promise<int>> getUnhealthyRelocationCount; // FIXME(xwang): change it to a synchronous call

virtual int getUnhealthyRelocationCount() = 0;
virtual ~IDDRelocationQueue() = default;
};

#endif // FOUNDATIONDB_DDRELOCATIONQUEUE_H
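A minimal sketch of a class satisfying this interface (the class name and counter are illustrative assumptions, not part of the commit): the point of the refactor is that callers read the count through a plain virtual call instead of round-tripping a Promise through a PromiseStream.

class RelocationQueueSketch : public IDDRelocationQueue {
    int unhealthyRelocations = 0; // maintained by the queue's own actors

public:
    // Synchronous read; no Promise round-trip needed.
    int getUnhealthyRelocationCount() override { return unhealthyRelocations; }
};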
@ -0,0 +1,40 @@
/*
* DDShardTracker.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOUNDATIONDB_DDSHARDTRACKER_H
#define FOUNDATIONDB_DDSHARDTRACKER_H
#include "DataDistribution.actor.h"

// send request/signal to DDShardTracker through interface
// call synchronous method from components outside DDShardTracker
struct IDDShardTracker {
// FIXME: the streams are not used yet
Promise<Void> readyToStart;
PromiseStream<GetMetricsRequest> getShardMetrics;
PromiseStream<GetTopKMetricsRequest> getTopKMetrics;
PromiseStream<GetMetricsListRequest> getShardMetricsList;
PromiseStream<KeyRange> restartShardTracker;

// PromiseStream<Promise<int64_t>> averageShardBytes; // FIXME(xwang): change it to a synchronous call

virtual double getAverageShardBytes() = 0;
virtual ~IDDShardTracker() = default;
};

#endif // FOUNDATIONDB_DDSHARDTRACKER_H
@ -0,0 +1,70 @@
/*
* DDSharedContext.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOUNDATIONDB_DDSHAREDCONTEXT_H
#define FOUNDATIONDB_DDSHAREDCONTEXT_H
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/ShardsAffectedByTeamFailure.h"
#include "fdbserver/DDShardTracker.h"
#include "fdbserver/DDRelocationQueue.h"
#include "fdbserver/DDTeamCollection.h"

// The common info shared by all DD components. Normally the DD components should share the reference to the same
// context.
// NOTE: We should avoid letting this shared class grow insanely large; think twice before adding a member to it.
class DDSharedContext : public ReferenceCounted<DDSharedContext> {
// FIXME(xwang) mark fields private
public:
std::unique_ptr<DDEnabledState>
ddEnabledState; // Note: don't operate directly because it's shared with snapshot server
IDDShardTracker* shardTracker = nullptr;
IDDRelocationQueue* relocationQueue = nullptr;
std::vector<IDDTeamCollection*> teamCollections;

// public:
UID ddId;
MoveKeysLock lock;
bool trackerCancelled = false;
DatabaseConfiguration configuration;

Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
Reference<AsyncVar<bool>> processingUnhealthy, processingWiggle;

DDSharedContext() = default;

DDSharedContext(UID id)
: ddEnabledState(new DDEnabledState), ddId(id), shardsAffectedByTeamFailure(new ShardsAffectedByTeamFailure),
processingUnhealthy(new AsyncVar<bool>(false)), processingWiggle(new AsyncVar<bool>(false)) {}

UID id() const { return ddId; }

void markTrackerCancelled() { trackerCancelled = true; }

bool isTrackerCancelled() const { return trackerCancelled; }

decltype(auto) usableRegions() const { return configuration.usableRegions; }

bool isDDEnabled() const { return ddEnabledState->isDDEnabled(); };

void proposeRelocation(const RelocateShard& rs) const { return relocationQueue->relocationProducer.send(rs); }

void requestRestartShardTracker(KeyRange keys) const { return shardTracker->restartShardTracker.send(keys); }
};

#endif // FOUNDATIONDB_DDSHAREDCONTEXT_H
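A sketch of the intended wiring, with the tracker and queue objects assumed to be implementations of the interfaces above (illustration only):

Reference<DDSharedContext> context = makeReference<DDSharedContext>(ddId); // ddId assumed
context->shardTracker = &tracker; // some IDDShardTracker implementation (assumed)
context->relocationQueue = &queue; // some IDDRelocationQueue implementation (assumed)
// Components holding the context can now signal each other without a direct dependency,
// e.g. ask the tracker to re-evaluate a range:
context->requestRestartShardTracker(keys); // forwards to tracker.restartShardTracker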
@ -173,6 +173,13 @@ FDB_DECLARE_BOOLEAN_PARAM(IsRedundantTeam);
FDB_DECLARE_BOOLEAN_PARAM(IsBadTeam);
FDB_DECLARE_BOOLEAN_PARAM(WaitWiggle);

// send request/signal to DDTeamCollection through interface
// call synchronous method from components outside DDTeamCollection
struct IDDTeamCollection {
PromiseStream<GetTeamRequest> getTeam;
virtual ~IDDTeamCollection() {}
};

class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
friend class DDTeamCollectionImpl;
friend class DDTeamCollectionUnitTest;
@ -402,7 +409,7 @@ class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
// in the next iteration of the loop. Otherwise, you may miss checking some elements in machineTeams
bool removeMachineTeam(Reference<TCMachineTeamInfo> targetMT);

// Adds storage servers held on process of which the Process Id is “id” into excludeServers which prevent
// Adds storage servers held on process of which the Process id is “id” into excludeServers which prevent
// recruiting the wiggling storage servers and let teamTracker start to move data off the affected teams;
// Returns a vector of futures to wait until all data is moved to other teams.
Future<Void> excludeStorageServersForWiggle(const UID& id);
@ -50,7 +50,7 @@ public:

virtual ~IDDTxnProcessor() = default;

[[nodiscard]] virtual Future<MoveKeysLock> takeMoveKeysLock(UID ddId) const { return MoveKeysLock(); }
[[nodiscard]] virtual Future<MoveKeysLock> takeMoveKeysLock(const UID& ddId) const { return MoveKeysLock(); }

virtual Future<DatabaseConfiguration> getDatabaseConfiguration() const { return DatabaseConfiguration(); }
@ -61,6 +61,19 @@ public:
}

virtual Future<Void> waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const = 0;

virtual Future<bool> isDataDistributionEnabled(const DDEnabledState* ddEnabledState) const = 0;

virtual Future<Void> pollMoveKeysLock(const MoveKeysLock& lock, const DDEnabledState* ddEnabledState) const = 0;

virtual Future<Void> removeKeysFromFailedServer(const UID& serverID,
const std::vector<UID>& teamForDroppedRange,
const MoveKeysLock& lock,
const DDEnabledState* ddEnabledState) const = 0;
virtual Future<Void> removeStorageServer(const UID& serverID,
const Optional<UID>& tssPairID,
const MoveKeysLock& lock,
const DDEnabledState* ddEnabledState) const = 0;
};

class DDTxnProcessorImpl;
@ -85,7 +98,7 @@ public:
const std::vector<Optional<Key>>& remoteDcIds,
const DDEnabledState* ddEnabledState) override;

Future<MoveKeysLock> takeMoveKeysLock(UID ddId) const override;
Future<MoveKeysLock> takeMoveKeysLock(UID const& ddId) const override;

Future<DatabaseConfiguration> getDatabaseConfiguration() const override;
@ -94,6 +107,24 @@ public:
const DatabaseConfiguration& configuration) const override;

Future<Void> waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const override;

Future<bool> isDataDistributionEnabled(const DDEnabledState* ddEnabledState) const override;

Future<Void> pollMoveKeysLock(const MoveKeysLock& lock, const DDEnabledState* ddEnabledState) const override;

Future<Void> removeKeysFromFailedServer(const UID& serverID,
const std::vector<UID>& teamForDroppedRange,
const MoveKeysLock& lock,
const DDEnabledState* ddEnabledState) const override {
return ::removeKeysFromFailedServer(cx, serverID, teamForDroppedRange, lock, ddEnabledState);
}

Future<Void> removeStorageServer(const UID& serverID,
const Optional<UID>& tssPairID,
const MoveKeysLock& lock,
const DDEnabledState* ddEnabledState) const override {
return ::removeStorageServer(cx, serverID, tssPairID, lock, ddEnabledState);
}
};

// A mock transaction implementation for test usage.
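Because DDTxnProcessorImpl only forwards to the ::removeKeysFromFailedServer / ::removeStorageServer free functions, a test can substitute a stub. A hedged sketch (stub name assumed; any pure virtuals not shown in this hunk are elided):

class DDTxnProcessorStub : public IDDTxnProcessor {
public:
    Future<Void> waitForDataDistributionEnabled(const DDEnabledState*) const override { return Void(); }
    Future<bool> isDataDistributionEnabled(const DDEnabledState*) const override { return true; }
    Future<Void> pollMoveKeysLock(const MoveKeysLock&, const DDEnabledState*) const override { return Never(); }
    Future<Void> removeKeysFromFailedServer(const UID&,
                                            const std::vector<UID>&,
                                            const MoveKeysLock&,
                                            const DDEnabledState*) const override {
        return Void(); // pretend the cleanup committed
    }
    Future<Void> removeStorageServer(const UID&,
                                     const Optional<UID>&,
                                     const MoveKeysLock&,
                                     const DDEnabledState*) const override {
        return Void();
    }
    // ... remaining pure virtual methods elided for brevity
};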
@ -36,6 +36,11 @@

#include "flow/actorcompiler.h" // This must be the last #include.

/////////////////////////////// Data //////////////////////////////////////
#ifndef __INTEL_COMPILER
#pragma region Data
#endif

// SOMEDAY: is it possible to combine RelocateReason and DataMovementReason?
// RelocateReason to DataMovementReason is a one-to-N mapping
class RelocateReason {
@ -88,7 +93,8 @@ enum class DataMovementReason {
TEAM_1_LEFT,
TEAM_FAILED,
TEAM_0_LEFT,
SPLIT_SHARD
SPLIT_SHARD,
ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD
};
extern int dataMovementPriority(DataMovementReason moveReason);
extern DataMovementReason priorityToDataMovementReason(int priority);
@ -291,6 +297,7 @@ struct GetTopKMetricsReply {
GetTopKMetricsReply(std::vector<KeyRangeStorageMetrics> const& m, double minReadLoad, double maxReadLoad)
: shardMetrics(m), minReadLoad(minReadLoad), maxReadLoad(maxReadLoad) {}
};

struct GetTopKMetricsRequest {
int topK = 1; // default only return the top 1 shard based on the GetTopKMetricsRequest::compare function
std::vector<KeyRange> keys;
@ -329,8 +336,186 @@ struct GetMetricsListRequest {
GetMetricsListRequest(KeyRange const& keys, const int shardLimit) : keys(keys), shardLimit(shardLimit) {}
};

struct TeamCollectionInterface {
PromiseStream<GetTeamRequest> getTeam;
// PhysicalShardCollection maintains physical shard concepts in data distribution
// A physical shard contains one or multiple shards (key ranges)
// PhysicalShardCollection is responsible for creation and maintenance of physical shards (including metrics)
// For multiple DCs, PhysicalShardCollection maintains a pair of primary team and remote team
// A primary team and a remote team share a physical shard
// For each shard (key-range) move, PhysicalShardCollection decides which physical shard and corresponding team(s) to
// move. The current design of PhysicalShardCollection assumes that there exist at most two teamCollections
// TODO: unit test needed
FDB_DECLARE_BOOLEAN_PARAM(InAnonymousPhysicalShard);
FDB_DECLARE_BOOLEAN_PARAM(PhysicalShardHasMoreThanKeyRange);
FDB_DECLARE_BOOLEAN_PARAM(InOverSizePhysicalShard);
FDB_DECLARE_BOOLEAN_PARAM(PhysicalShardAvailable);
FDB_DECLARE_BOOLEAN_PARAM(MoveKeyRangeOutPhysicalShard);

class PhysicalShardCollection : public ReferenceCounted<PhysicalShardCollection> {
public:
PhysicalShardCollection() : requireTransition(false), lastTransitionStartTime(now()) {}

enum class PhysicalShardCreationTime { DDInit, DDRelocator };

struct PhysicalShard {
PhysicalShard() : id(UID().first()) {}

PhysicalShard(uint64_t id,
StorageMetrics const& metrics,
std::vector<ShardsAffectedByTeamFailure::Team> teams,
PhysicalShardCreationTime whenCreated)
: id(id), metrics(metrics), teams(teams), whenCreated(whenCreated) {}

std::string toString() const { return fmt::format("{}", std::to_string(id)); }

uint64_t id; // physical shard id (never changed)
StorageMetrics metrics; // current metrics, updated by shardTracker
std::vector<ShardsAffectedByTeamFailure::Team> teams; // which team owns this physical shard (never changed)
PhysicalShardCreationTime whenCreated; // when this physical shard is created (never changed)
};

// Two-step team selection
// Usage: getting primary dest team and remote dest team in dataDistributionRelocator()
// The overall process has two steps:
// Step 1: get a physical shard id given the input primary team
// Return a new physical shard id if the input primary team is new or the team has no available physical shard
// checkPhysicalShardAvailable() defines whether a physical shard is available
uint64_t determinePhysicalShardIDGivenPrimaryTeam(ShardsAffectedByTeamFailure::Team primaryTeam,
StorageMetrics const& metrics,
bool forceToUseNewPhysicalShard,
uint64_t debugID);

// Step 2: get a remote team which has the input physical shard
// Return empty if no such remote team exists
// May return a problematic remote team, in which case re-selection is required
Optional<ShardsAffectedByTeamFailure::Team> tryGetAvailableRemoteTeamWith(uint64_t inputPhysicalShardID,
StorageMetrics const& moveInMetrics,
uint64_t debugID);
// Invariant:
// (1) If forceToUseNewPhysicalShard is set, use the bestTeams selected by getTeam(), and create a new physical
// shard for the teams
// (2) If forceToUseNewPhysicalShard is not set, use the primary team selected by getTeam()
// If there exists a remote team which has an available physical shard with the primary team
// Then, use the remote team. Note that the remote team may be unhealthy and the remote team
// may be the one issuing the current data relocation.
// In this case, we set forceToUseNewPhysicalShard to use getTeam() to re-select the remote team
// Otherwise, use getTeam() to re-select the remote team
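A sketch of how a relocator might chain the two steps under this invariant; the surrounding state (collection, primaryTeam, moveInMetrics, debugID) is assumed from context:

uint64_t shardID = collection->determinePhysicalShardIDGivenPrimaryTeam(
    primaryTeam, moveInMetrics, /*forceToUseNewPhysicalShard=*/false, debugID);
Optional<ShardsAffectedByTeamFailure::Team> remoteTeam =
    collection->tryGetAvailableRemoteTeamWith(shardID, moveInMetrics, debugID);
if (!remoteTeam.present()) {
    // No remote team shares an available physical shard with the primary team:
    // re-run getTeam() with forceToUseNewPhysicalShard set, creating a fresh physical shard.
}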

// Create a physical shard when initializing PhysicalShardCollection
void initPhysicalShardCollection(KeyRange keys,
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams,
uint64_t physicalShardID,
uint64_t debugID);

// Create a physical shard when updating PhysicalShardCollection
void updatePhysicalShardCollection(KeyRange keys,
bool isRestore,
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams,
uint64_t physicalShardID,
const StorageMetrics& metrics,
uint64_t debugID);

// Update physicalShard metrics and return whether the keyRange needs to move out of its physical shard
MoveKeyRangeOutPhysicalShard trackPhysicalShard(KeyRange keyRange,
StorageMetrics const& newMetrics,
StorageMetrics const& oldMetrics,
bool initWithNewMetrics);

// Clean up empty physicalShards
void cleanUpPhysicalShardCollection();

// Log physicalShards
void logPhysicalShardCollection();

private:
// Track physicalShard metrics by tracking keyRange metrics
void updatePhysicalShardMetricsByKeyRange(KeyRange keyRange,
StorageMetrics const& newMetrics,
StorageMetrics const& oldMetrics,
bool initWithNewMetrics);

// Check whether the input keyRange is in the anonymous physical shard
InAnonymousPhysicalShard isInAnonymousPhysicalShard(KeyRange keyRange);

// Check whether the input physicalShard has more keyRanges in addition to the input keyRange
PhysicalShardHasMoreThanKeyRange whetherPhysicalShardHasMoreThanKeyRange(uint64_t physicalShardID,
KeyRange keyRange);

// Check whether the input keyRange is in an oversize physical shard
// This function returns true to enforce the keyRange to move out of the physical shard
// Note that if the physical shard only contains the keyRange, this always returns FALSE
InOverSizePhysicalShard isInOverSizePhysicalShard(KeyRange keyRange);

// Generate a random physical shard ID, which is not UID().first() nor anonymousShardId.first()
uint64_t generateNewPhysicalShardID(uint64_t debugID);

// Check whether the input physical shard is available
// A physical shard is available if the current metrics + moveInMetrics <= a threshold
PhysicalShardAvailable checkPhysicalShardAvailable(uint64_t physicalShardID, StorageMetrics const& moveInMetrics);

// If the input team has any available physical shard, return an available physical shard of the input team
Optional<uint64_t> trySelectAvailablePhysicalShardFor(ShardsAffectedByTeamFailure::Team team,
StorageMetrics const& metrics,
uint64_t debugID);

// Reduce the metrics of the input physical shard by the input metrics
void reduceMetricsForMoveOut(uint64_t physicalShardID, StorageMetrics const& metrics);

// Add the input metrics to the metrics of the input physical shard
void increaseMetricsForMoveIn(uint64_t physicalShardID, StorageMetrics const& metrics);

// In physicalShardCollection, add a physical shard initialized by the input parameters to the collection
void insertPhysicalShardToCollection(uint64_t physicalShardID,
StorageMetrics const& metrics,
std::vector<ShardsAffectedByTeamFailure::Team> teams,
uint64_t debugID,
PhysicalShardCreationTime whenCreated);

// In teamPhysicalShardIDs, add the input physical shard id to the input teams
void updateTeamPhysicalShardIDsMap(uint64_t physicalShardID,
std::vector<ShardsAffectedByTeamFailure::Team> inputTeams,
uint64_t debugID);

// In keyRangePhysicalShardIDMap, set the input physical shard id to the input key range
void updatekeyRangePhysicalShardIDMap(KeyRange keyRange, uint64_t physicalShardID, uint64_t debugID);

// Return a string concatenating the input IDs, separated by " "
std::string convertIDsToString(std::set<uint64_t> ids);

// Reset TransitionStartTime
// Consider a system without the concept of physicalShard
// On restart, the system begins with a state where all keyRanges are in the anonymousShard
// Our goal is to move all keyRanges out of the anonymousShard
// A keyRange moves out of the anonymousShard when a data move is triggered on the keyRange
// It is possible that a keyRange is cold and no data move is triggered on it for a long time
// In this case, we need to intentionally trigger a data move on that keyRange
// The minimal time span between two successive data moves for this purpose is TransitionStartTime
inline void resetLastTransitionStartTime() { // reset when a keyRange move is triggered for the transition
lastTransitionStartTime = now();
return;
}

// When DD restarts, it checks whether any keyRange is in the anonymousShard
// If yes, setTransitionCheck() is called to trigger the process of removing the anonymousShard
inline void setTransitionCheck() {
if (requireTransition == true) {
return;
}
requireTransition = true;
TraceEvent("PhysicalShardSetTransitionCheck");
return;
}

inline bool requireTransitionCheck() { return requireTransition; }

// Core data structures
// Physical shard instances indexed by physical shard id
std::unordered_map<uint64_t, PhysicalShard> physicalShardInstances;
// Indicates which physical shard a key range belongs to
KeyRangeMap<uint64_t> keyRangePhysicalShardIDMap;
// Indicates which physical shards are owned by a team
std::map<ShardsAffectedByTeamFailure::Team, std::set<uint64_t>> teamPhysicalShardIDs;
double lastTransitionStartTime;
bool requireTransition;
};

// DDShardInfo is so named to avoid link-time name collision with ShardInfo within the StorageServer
@ -382,10 +567,46 @@ struct ShardTrackedData {
Reference<AsyncVar<Optional<ShardMetrics>>> stats;
};

// Holds the permitted size and IO Bounds for a shard
struct ShardSizeBounds {
StorageMetrics max;
StorageMetrics min;
StorageMetrics permittedError;

bool operator==(ShardSizeBounds const& rhs) const {
return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError;
}
};

// Gets the permitted size and IO bounds for a shard
ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize);

// Determines the maximum shard size based on the size of the database
int64_t getMaxShardSize(double dbSizeEstimate);
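A sketch of how these helpers combine in a tracker, with shard, stats, and dbSizeEstimate assumed from the surrounding code:

int64_t maxShardSize = getMaxShardSize(dbSizeEstimate);
ShardSizeBounds bounds = getShardSizeBounds(shard, maxShardSize);
if (stats.bytes > bounds.max.bytes) {
    // oversized: request a shard split
} else if (stats.bytes < bounds.min.bytes) {
    // undersized: consider merging with an adjacent shard
}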

struct StorageQuotaInfo {
std::map<Key, uint64_t> quotaMap;
};

#ifndef __INTEL_COMPILER
#pragma endregion
#endif

// FIXME(xwang): Delete Old DD Actors once the refactoring is done
/////////////////////////////// Old DD Actors //////////////////////////////////////
#ifndef __INTEL_COMPILER
#pragma region Old DD Actors
#endif

struct TeamCollectionInterface {
PromiseStream<GetTeamRequest> getTeam;
};

ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> initData,
Database cx,
PromiseStream<RelocateShard> output,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
Reference<PhysicalShardCollection> physicalShardCollection,
PromiseStream<GetMetricsRequest> getShardMetrics,
FutureStream<GetTopKMetricsRequest> getTopKMetrics,
PromiseStream<GetMetricsListRequest> getShardMetricsList,
@ -405,6 +626,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
Reference<AsyncVar<bool>> processingWiggle,
std::vector<TeamCollectionInterface> teamCollection,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
Reference<PhysicalShardCollection> physicalShardCollection,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
FutureStream<Promise<int>> getUnhealthyRelocationCount,
@ -412,24 +634,14 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
int teamSize,
int singleRegionTeamSize,
const DDEnabledState* ddEnabledState);
#ifndef __INTEL_COMPILER
#pragma endregion
#endif

// Holds the permitted size and IO Bounds for a shard
struct ShardSizeBounds {
StorageMetrics max;
StorageMetrics min;
StorageMetrics permittedError;

bool operator==(ShardSizeBounds const& rhs) const {
return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError;
}
};

// Gets the permitted size and IO bounds for a shard
ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize);

// Determines the maximum shard size based on the size of the database
int64_t getMaxShardSize(double dbSizeEstimate);

/////////////////////////////// Perpetual Storage Wiggle //////////////////////////////////////
#ifndef __INTEL_COMPILER
#pragma region Perpetual Storage Wiggle
#endif
class DDTeamCollection;

struct StorageWiggleMetrics {
@ -591,5 +803,9 @@ struct StorageWiggler : ReferenceCounted<StorageWiggler> {
}
};

#ifndef __INTEL_COMPILER
#pragma endregion
#endif

#include "flow/unactorcompiler.h"
#endif
@ -1,119 +0,0 @@
/*
* EncryptedMutationMessage.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef FDBSERVER_ENCRYPTEDMUTATIONMESSAGE_H
#define FDBSERVER_ENCRYPTEDMUTATIONMESSAGE_H

#pragma once

#include "fdbclient/CommitTransaction.h"
#include "fdbserver/Knobs.h"
#include "flow/BlobCipher.h"

struct EncryptedMutationMessage {

BlobCipherEncryptHeader header;
StringRef encrypted;

EncryptedMutationMessage() {}

std::string toString() const {
return format("code: %d, encryption info: %s",
MutationRef::Reserved_For_EncryptedMutationMessage,
header.toString().c_str());
}

template <class Ar>
void serialize(Ar& ar) {
uint8_t poly = MutationRef::Reserved_For_EncryptedMutationMessage;
serializer(ar, poly, header, encrypted);
}

static bool startsEncryptedMutationMessage(uint8_t byte) {
return byte == MutationRef::Reserved_For_EncryptedMutationMessage;
}
template <class Ar>
static bool isNextIn(Ar& ar) {
return startsEncryptedMutationMessage(*(const uint8_t*)ar.peekBytes(1));
}

// Encrypt the given mutation and return an EncryptedMutationMessage.
static EncryptedMutationMessage encrypt(
Arena& arena,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>& cipherKeys,
const EncryptCipherDomainId& domainId,
const MutationRef& mutation) {
ASSERT_NE(domainId, ENCRYPT_INVALID_DOMAIN_ID);
auto textCipherItr = cipherKeys.find(domainId);
auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID);
ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid());
ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid());
uint8_t iv[AES_256_IV_LENGTH];
deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH);
BinaryWriter bw(AssumeVersion(g_network->protocolVersion()));
bw << mutation;
EncryptedMutationMessage encrypted_mutation;
EncryptBlobCipherAes265Ctr cipher(textCipherItr->second,
headerCipherItr->second,
iv,
AES_256_IV_LENGTH,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
encrypted_mutation.encrypted =
cipher
.encrypt(static_cast<const uint8_t*>(bw.getData()), bw.getLength(), &encrypted_mutation.header, arena)
->toStringRef();
return encrypted_mutation;
}

// Encrypt a system key space mutation and return an EncryptedMutationMessage.
static EncryptedMutationMessage encryptMetadata(
Arena& arena,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>& cipherKeys,
const MutationRef& mutation) {
return encrypt(arena, cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, mutation);
}

// Read an EncryptedMutationMessage from the given reader, decrypt it and return the decrypted mutation.
// Also return the decrypt buffer through buf, if it is specified.
template <class Ar>
static MutationRef decrypt(Ar& ar,
Arena& arena,
const std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>& cipherKeys,
StringRef* buf = nullptr) {
ASSERT(SERVER_KNOBS->ENABLE_ENCRYPTION);
EncryptedMutationMessage msg;
ar >> msg;
auto textCipherItr = cipherKeys.find(msg.header.cipherTextDetails);
auto headerCipherItr = cipherKeys.find(msg.header.cipherHeaderDetails);
ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid());
ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid());
DecryptBlobCipherAes256Ctr cipher(textCipherItr->second, headerCipherItr->second, msg.header.iv);
StringRef plaintext =
cipher.decrypt(msg.encrypted.begin(), msg.encrypted.size(), msg.header, arena)->toStringRef();
if (buf != nullptr) {
*buf = plaintext;
}
ArenaReader reader(arena, plaintext, AssumeVersion(g_network->protocolVersion()));
MutationRef mutation;
reader >> mutation;
return mutation;
}
};
#endif
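With this header deleted, encrypted mutations are recognized and decrypted through MutationRef itself, as the storage server hunks below show. The replacement read path is essentially:

MutationRef msg;
reader >> msg;
if (msg.isEncrypted()) {
    // cipher keys were fetched earlier for the BlobCipherDetails in msg.encryptionHeader()
    msg = msg.decrypt(cipherKeys.get(), reader.arena());
}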
@ -181,15 +181,30 @@ ACTOR Future<std::vector<std::pair<uint64_t, double>>> trackInsertionCount(Datab

ACTOR template <class T>
Future<Void> waitForLowInFlight(Database cx, T* workload) {
state Future<Void> timeout = delay(600.0);
loop {
int64_t inFlight = wait(getDataInFlight(cx, workload->dbInfo));
TraceEvent("DynamicWarming").detail("InFlight", inFlight);
if (inFlight > 1e6) { // Wait for just 1 MB to be in flight
wait(delay(1.0));
} else {
wait(delay(1.0));
TraceEvent("DynamicWarmingDone").log();
break;
try {
if (timeout.isReady()) {
throw timed_out();
}

int64_t inFlight = wait(getDataInFlight(cx, workload->dbInfo));
TraceEvent("DynamicWarming").detail("InFlight", inFlight);
if (inFlight > 1e6) { // Wait for just 1 MB to be in flight
wait(delay(1.0));
} else {
wait(delay(1.0));
TraceEvent("DynamicWarmingDone").log();
break;
}
} catch (Error& e) {
if (e.code() == error_code_attribute_not_found) {
// DD may not be initialized yet and attribute "DataInFlight" can be missing
wait(delay(1.0));
} else {
TraceEvent(SevWarn, "WaitForLowInFlightError").error(e);
throw;
}
}
}
return Void();
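The explicit deadline above could be factored into a reusable guard; a sketch under an assumed name (flow's delay() and timed_out() are real):

ACTOR template <class PredFn>
Future<Void> pollUntil(PredFn pred, double pollDelay, double deadline) {
    state Future<Void> timeout = delay(deadline);
    loop {
        if (timeout.isReady()) {
            throw timed_out(); // give up instead of polling forever
        }
        if (pred()) {
            return Void();
        }
        wait(delay(pollDelay));
    }
}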
@ -489,10 +489,3 @@ TEST_CASE("/fdbserver/MasterServer/FigureVersion/NegativeReferenceVersion") {
ASSERT_EQ(figureVersion(0, 2.0, -1e6, 5e5, 0.1, 1e6), 550000);
return Void();
}

TEST_CASE("/fdbserver/MasterServer/FigureVersion/Overflow") {
// The upper range used in std::clamp should overflow.
ASSERT_EQ(figureVersion(std::numeric_limits<Version>::max() - static_cast<Version>(1e6), 1.0, 0, 1e6, 0.1, 1e6),
std::numeric_limits<Version>::max() - static_cast<Version>(1e6 * 0.1));
return Void();
}
@ -58,7 +58,6 @@
#include "fdbrpc/sim_validation.h"
#include "fdbrpc/Smoother.h"
#include "fdbrpc/Stats.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "fdbserver/GetEncryptCipherKeys.h"
#include "fdbserver/IKeyValueStore.h"
@ -553,7 +552,6 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
Version storageVersion = invalidVersion; // Versions between the storage version and the durable version are
// being written to disk as part of the current commit in updateStorage.
Version durableVersion = invalidVersion; // All versions before the durable version are durable on disk
// FIXME: this needs to get persisted to disk to still fix same races across restart!
Version metadataVersion = invalidVersion; // Last update to the change feed metadata. Used for reasoning about
// fetched metadata vs local metadata
Version emptyVersion = 0; // The change feed does not have any mutations before emptyVersion
@ -1185,11 +1183,6 @@ public:
});
specialCounter(
cc, "FetchKeysFullFetchWaiting", [self]() { return self->fetchKeysParallelismFullLock.waiters(); });
specialCounter(cc, "FetchChangeFeedFetchActive", [self]() {
return self->fetchChangeFeedParallelismLock.activePermits();
});
specialCounter(
cc, "FetchChangeFeedWaiting", [self]() { return self->fetchChangeFeedParallelismLock.waiters(); });
specialCounter(cc, "ServeFetchCheckpointActive", [self]() {
return self->serveFetchCheckpointParallelismLock.activePermits();
});
@ -1249,7 +1242,6 @@ public:
numWatches(0), noRecentUpdates(false), lastUpdate(now()), updateEagerReads(nullptr),
fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM),
fetchKeysParallelismFullLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_FULL),
fetchChangeFeedParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM),
fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false),
serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM),
instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false),
@ -5804,10 +5796,6 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
Version endVersion) {
wait(delay(0)); // allow this actor to be cancelled by removals

// bound active change feed fetches
wait(data->fetchChangeFeedParallelismLock.take(TaskPriority::DefaultYield));
state FlowLock::Releaser holdingFCFPL(data->fetchChangeFeedParallelismLock);

TraceEvent(SevDebug, "FetchChangeFeed", data->thisServerID)
.detail("RangeID", changeFeedInfo->id)
.detail("Range", changeFeedInfo->range)
@ -6495,8 +6483,10 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
}

// FIXME: remove when we no longer support upgrades from 5.X
data->cx->enableLocalityLoadBalance = EnableLocalityLoadBalance::True;
TraceEvent(SevWarnAlways, "FKReenableLB").detail("FKID", fetchKeysID);
if (!data->cx->enableLocalityLoadBalance) {
data->cx->enableLocalityLoadBalance = EnableLocalityLoadBalance::True;
TraceEvent(SevWarnAlways, "FKReenableLB").detail("FKID", fetchKeysID);
}

// We have completed the fetch and write of the data, now we wait for MVCC window to pass.
// As we have finished this work, we will allow more work to start...
@ -7998,23 +7988,18 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
OTELSpanContextMessage::isNextIn(cloneReader)) {
OTELSpanContextMessage scm;
cloneReader >> scm;
} else if (cloneReader.protocolVersion().hasEncryptionAtRest() &&
EncryptedMutationMessage::isNextIn(cloneReader) && !cipherKeys.present()) {
// Encrypted mutation found, but cipher keys haven't been fetched.
// Collect cipher details to fetch cipher keys in one batch.
EncryptedMutationMessage emm;
cloneReader >> emm;
cipherDetails.insert(emm.header.cipherTextDetails);
cipherDetails.insert(emm.header.cipherHeaderDetails);
collectingCipherKeys = true;
} else {
MutationRef msg;
if (cloneReader.protocolVersion().hasEncryptionAtRest() &&
EncryptedMutationMessage::isNextIn(cloneReader)) {
assert(cipherKeys.present());
msg = EncryptedMutationMessage::decrypt(cloneReader, eager.arena, cipherKeys.get());
} else {
cloneReader >> msg;
cloneReader >> msg;
if (msg.isEncrypted()) {
if (!cipherKeys.present()) {
const BlobCipherEncryptHeader* header = msg.encryptionHeader();
cipherDetails.insert(header->cipherTextDetails);
cipherDetails.insert(header->cipherHeaderDetails);
collectingCipherKeys = true;
} else {
msg = msg.decrypt(cipherKeys.get(), eager.arena);
}
}
// TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg);
@ -8158,11 +8143,10 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
spanContext = scm.spanContext;
} else {
MutationRef msg;
if (rd.protocolVersion().hasEncryptionAtRest() && EncryptedMutationMessage::isNextIn(rd)) {
rd >> msg;
if (msg.isEncrypted()) {
ASSERT(cipherKeys.present());
msg = EncryptedMutationMessage::decrypt(rd, rd.arena(), cipherKeys.get());
} else {
rd >> msg;
msg = msg.decrypt(cipherKeys.get(), rd.arena());
}

Span span("SS:update"_loc, spanContext);
@ -1576,6 +1576,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
state Reference<AsyncVar<Optional<RatekeeperInterface>>> rkInterf(new AsyncVar<Optional<RatekeeperInterface>>());
state Reference<AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>>> bmEpochAndInterf(
new AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>>());
state UID lastBMRecruitRequestId;
state Reference<AsyncVar<Optional<EncryptKeyProxyInterface>>> ekpInterf(
new AsyncVar<Optional<EncryptKeyProxyInterface>>());
state Future<Void> handleErrors = workerHandleErrors(errors.getFuture()); // Needs to be stopped last
@ -2090,9 +2091,17 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
recruited.initEndpoints();

if (bmEpochAndInterf->get().present() && bmEpochAndInterf->get().get().first == req.epoch) {
ASSERT(req.reqId == lastBMRecruitRequestId);
recruited = bmEpochAndInterf->get().get().second;

CODE_PROBE(true, "Recruited while already a blob manager.");
} else if (lastBMRecruitRequestId == req.reqId && !bmEpochAndInterf->get().present()) {
// The previous blob manager WAS present, like the above case, but it died before the CC got the
// response to the recruitment request, so the CC retried to recruit the same blob manager id/epoch
// from the same reqId. To keep epoch safety between different managers, instead of restarting the
// same manager id at the same epoch, we should just tell it the original request succeeded, and let
// it realize this manager died via failure detection and start a new one.
CODE_PROBE(true, "Recruited while formerly the same blob manager.");
} else {
// TODO: it'd be more optimal to halt the last manager if present here, but it will figure it out
// via the epoch check
@ -2105,6 +2114,8 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
DUMPTOKEN(recruited.haltBlobGranules);
DUMPTOKEN(recruited.blobManagerExclCheckReq);

lastBMRecruitRequestId = req.reqId;

Future<Void> blobManagerProcess = blobManager(recruited, dbInfo, req.epoch);
errorForwarders.add(
forwardError(errors,
@ -143,7 +143,17 @@ struct ThreadData : ReferenceCounted<ThreadData>, NonCopyable {
} catch (Error& e) {
// Ignore being unable to parse lastKey as it may be a dummy key.
}

if (t2.size() > 0 && t.getInt(0) != t2.getInt(0)) {
if (t.size() > BGW_TUPLE_KEY_SIZE - SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET) {
fmt::print("Tenant: {0}, K={1}, E={2}, LK={3}. {4} != {5}\n",
tenant.prefix.printable(),
k.printable(),
e.printable(),
lastKey.printable(),
t.getInt(0),
t2.getInt(0));
}
ASSERT(t.size() <= BGW_TUPLE_KEY_SIZE - SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET);
}
}
@ -230,27 +240,6 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
}
}

ACTOR Future<Void> setUpBlobRange(Database cx, KeyRange keyRange) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString());
wait(krmSetRange(tr, blobRangeKeys.begin, keyRange, LiteralStringRef("1")));
wait(tr->commit());
if (BGW_DEBUG) {
fmt::print("Successfully set up blob granule range for tenant range [{0} - {1})\n",
keyRange.begin.printable(),
keyRange.end.printable());
}
return Void();
} catch (Error& e) {
wait(tr->onError(e));
}
}
}

ACTOR Future<TenantMapEntry> setUpTenant(Database cx, TenantName name) {
if (BGW_DEBUG) {
fmt::print("Setting up blob granule range for tenant {0}\n", name.printable());
@ -291,7 +280,8 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
self->directories[directoryIdx]->directoryRange =
KeyRangeRef(tenantEntry.prefix, tenantEntry.prefix.withSuffix(normalKeys.end));
tenants.push_back({ self->directories[directoryIdx]->tenantName, tenantEntry });
wait(self->setUpBlobRange(cx, self->directories[directoryIdx]->directoryRange));
bool _success = wait(cx->blobbifyRange(self->directories[directoryIdx]->directoryRange));
ASSERT(_success);
}
tenantData.addTenants(tenants);
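blobbifyRange() replaces the hand-rolled system-key transaction deleted above. A sketch of the call pattern, with cx and range assumed, and assuming the companion listBlobbifiedRanges() API for enumeration:

bool success = wait(cx->blobbifyRange(range));
ASSERT(success);
// Blobbified ranges can later be enumerated, up to a caller-supplied limit:
Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(cx->listBlobbifiedRanges(range, 1000));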
@ -911,8 +901,8 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
loop {
state Transaction tr(cx, threadData->tenantName);
try {
Standalone<VectorRef<KeyRangeRef>> ranges = wait(tr.getBlobGranuleRanges(normalKeys));
ASSERT(ranges.size() >= 1);
Standalone<VectorRef<KeyRangeRef>> ranges = wait(tr.getBlobGranuleRanges(normalKeys, 1000000));
ASSERT(ranges.size() >= 1 && ranges.size() < 1000000);
ASSERT(ranges.front().begin == normalKeys.begin);
ASSERT(ranges.back().end == normalKeys.end);
for (int i = 0; i < ranges.size() - 1; i++) {
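Because getBlobGranuleRanges() now takes a range limit, a single call may not cover the whole key space. A hedged sketch of paging through results (tr and the page size are assumptions):

state KeyRange remaining = normalKeys;
state Standalone<VectorRef<KeyRangeRef>> all;
loop {
    Standalone<VectorRef<KeyRangeRef>> page = wait(tr.getBlobGranuleRanges(remaining, 1000));
    if (page.empty()) {
        break;
    }
    all.arena().dependsOn(page.arena());
    all.append(all.arena(), page.begin(), page.size());
    if (page.back().end >= remaining.end) {
        break;
    }
    remaining = KeyRangeRef(page.back().end, remaining.end);
}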
@ -68,6 +68,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
bool initAtEnd;
bool strictPurgeChecking;
bool doForcePurge;
bool purgeAtLatest;
bool clearAndMergeCheck;

DatabaseConfiguration config;
@ -87,6 +88,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
timeTravelLimit = getOption(options, LiteralStringRef("timeTravelLimit"), testDuration);
timeTravelBufferSize = getOption(options, LiteralStringRef("timeTravelBufferSize"), 100000000);
threads = getOption(options, LiteralStringRef("threads"), 1);

enablePurging = getOption(options, LiteralStringRef("enablePurging"), sharedRandomNumber % 3 == 0);
sharedRandomNumber /= 3;
// FIXME: re-enable this! There exist several bugs with purging active granules where a small amount of state
@ -98,6 +100,9 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
doForcePurge = getOption(options, LiteralStringRef("doForcePurge"), sharedRandomNumber % 3 == 0);
sharedRandomNumber /= 3;

purgeAtLatest = getOption(options, LiteralStringRef("purgeAtLatest"), sharedRandomNumber % 3 == 0);
sharedRandomNumber /= 3;

// randomly some tests write data first and then turn on blob granules later, to test conversion of existing DB
initAtEnd = !enablePurging && sharedRandomNumber % 10 == 0;
sharedRandomNumber /= 10;
@ -105,13 +110,31 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
clearAndMergeCheck = getOption(options, LiteralStringRef("clearAndMergeCheck"), sharedRandomNumber % 10 == 0);
sharedRandomNumber /= 10;

// don't do strictPurgeChecking or forcePurge if !enablePurging
if (!enablePurging) {
strictPurgeChecking = false;
doForcePurge = false;
purgeAtLatest = false;
}

if (doForcePurge) {
purgeAtLatest = false;
}

if (purgeAtLatest) {
strictPurgeChecking = false;
}

startedForcePurge = false;

if (doSetup && BGV_DEBUG) {
fmt::print("BlobGranuleVerifier starting\n");
fmt::print(" enablePurging={0}\n", enablePurging);
fmt::print(" purgeAtLatest={0}\n", purgeAtLatest);
fmt::print(" strictPurgeChecking={0}\n", strictPurgeChecking);
fmt::print(" doForcePurge={0}\n", doForcePurge);
fmt::print(" initAtEnd={0}\n", initAtEnd);
fmt::print(" clearAndMergeCheck={0}\n", clearAndMergeCheck);
}

ASSERT(threads >= 1);
@ -169,7 +192,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
state Transaction tr(cx);
loop {
try {
Standalone<VectorRef<KeyRangeRef>> allGranules = wait(tr.getBlobGranuleRanges(normalKeys));
Standalone<VectorRef<KeyRangeRef>> allGranules = wait(tr.getBlobGranuleRanges(normalKeys, 1000000));
self->granuleRanges.set(allGranules);
break;
} catch (Error& e) {
@ -308,7 +331,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {

// before doing read, purge just before read version
state Version newPurgeVersion = 0;
state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5;
state bool doPurging =
allowPurging && !self->purgeAtLatest && deterministicRandom()->random01() < 0.5;
state bool forcePurge = doPurging && self->doForcePurge && deterministicRandom()->random01() < 0.25;
if (doPurging) {
CODE_PROBE(true, "BGV considering purge");
@ -436,11 +460,33 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
state KeyRange range = self->granuleRanges.get()[rIndex];

state std::pair<RangeResult, Version> fdb = wait(readFromFDB(cx, range));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
state std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
wait(readFromBlob(cx, self->bstore, range, 0, fdb.second));
if (self->purgeAtLatest && timeTravelChecks.empty() && deterministicRandom()->random01() < 0.25) {
// purge at this version, and make sure it's still readable on our immediate re-read
try {
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, fdb.second, {}, false));
if (BGV_DEBUG) {
fmt::print("BGV Purged Latest @ {0}, waiting\n", fdb.second);
}
wait(cx->waitPurgeGranulesComplete(purgeKey));
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
throw e;
}
// purging shouldn't error, it should retry.
if (BGV_DEBUG) {
fmt::print("Unexpected error {0} purging latest @ {1}!\n", e.name(), newPurgeVersion);
}
ASSERT(false);
}
self->purges++;
}
if (compareFDBAndBlob(fdb.first, blob, range, fdb.second, BGV_DEBUG)) {
// TODO: bias for immediately re-reading to catch rollback cases
double reReadTime = currentTime + deterministicRandom()->random01() * self->timeTravelLimit;
bool rereadImmediately = self->purgeAtLatest || deterministicRandom()->random01() < 0.25;
double reReadTime =
currentTime +
(rereadImmediately ? 0.0 : deterministicRandom()->random01() * self->timeTravelLimit);
int memory = fdb.first.expectedSize();
if (reReadTime <= endTime &&
timeTravelChecksMemory + memory <= (self->timeTravelBufferSize / self->threads)) {
@ -463,7 +509,6 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
self->errors++;
}
// wait(poisson(&last, 5.0));
wait(poisson(&last, 0.1));
}
}
@ -497,7 +542,73 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
}

ACTOR Future<Void> checkPurgedHistoryEntries(Database cx, BlobGranuleVerifierWorkload* self, KeyRange purgeRange) {
ACTOR Future<bool> checkGranuleMetadataPurged(Transaction* tr,
KeyRange granuleRange,
Version historyVersion,
UID granuleId,
bool strictMetadataCheck,
bool possiblyInFlight) {
// change feed
Optional<Value> changeFeed = wait(tr->get(granuleIDToCFKey(granuleId).withPrefix(changeFeedPrefix)));
if (possiblyInFlight && changeFeed.present()) {
fmt::print("WARN: Change Feed for [{0} - {1}): {2} not purged, retrying\n",
granuleRange.begin.printable(),
granuleRange.end.printable(),
granuleId.toString().substr(0, 6));
return false;
}
ASSERT(!changeFeed.present());

// file metadata
RangeResult fileMetadata = wait(tr->getRange(blobGranuleFileKeyRangeFor(granuleId), 1));
if (possiblyInFlight && !fileMetadata.empty()) {
fmt::print("WARN: File metadata for [{0} - {1}): {2} not purged, retrying\n",
granuleRange.begin.printable(),
granuleRange.end.printable(),
granuleId.toString().substr(0, 6));
return false;
}
ASSERT(fileMetadata.empty());

if (strictMetadataCheck) {
// lock
Optional<Value> lock = wait(tr->get(blobGranuleLockKeyFor(granuleRange)));
if (possiblyInFlight && lock.present()) {
return false;
}
ASSERT(!lock.present());

// history entry
Optional<Value> history = wait(tr->get(blobGranuleHistoryKeyFor(granuleRange, historyVersion)));
if (possiblyInFlight && history.present()) {
return false;
}
ASSERT(!history.present());

// split state
RangeResult splitData = wait(tr->getRange(blobGranuleSplitKeyRangeFor(granuleId), 1));
if (possiblyInFlight && !splitData.empty()) {
return false;
}
ASSERT(splitData.empty());

// merge state
Optional<Value> merge = wait(tr->get(blobGranuleMergeKeyFor(granuleId)));
if (possiblyInFlight && merge.present()) {
return false;
}
ASSERT(!merge.present());

// FIXME: add merge boundaries!
}

return true;
}

ACTOR Future<Void> checkPurgedHistoryEntries(Database cx,
BlobGranuleVerifierWorkload* self,
KeyRange purgeRange,
bool strictMetadataCheck) {
// quick check to make sure we didn't miss any new granules generated between the purge metadata load time and
// the actual purge, by checking for any new history keys in the range
// FIXME: fix this check! The BW granule check is really the important one, this finds occasional leftover
@ -506,6 +617,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
// force purging, we check that the history version > the force purge version
state Transaction tr(cx);
state KeyRange cur = blobGranuleHistoryKeys;
state std::vector<std::tuple<KeyRange, Version, UID>> granulesToCheck;
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
@ -515,7 +627,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
Version version;
|
||||
std::tie(keyRange, version) = decodeBlobGranuleHistoryKey(it.key);
|
||||
if (purgeRange.intersects(keyRange)) {
|
||||
if (BGV_DEBUG && version <= self->forcePurgeVersion) {
|
||||
if (BGV_DEBUG) {
|
||||
fmt::print("Found range [{0} - {1}) @ {2} that avoided force purge [{3} - {4}) @ {5}!!\n",
|
||||
keyRange.begin.printable(),
|
||||
keyRange.end.printable(),
|
||||
|
@ -524,9 +636,13 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
purgeRange.end.printable(),
|
||||
self->forcePurgeVersion);
|
||||
}
|
||||
ASSERT(version > self->forcePurgeVersion);
|
||||
if (strictMetadataCheck) {
|
||||
ASSERT(!purgeRange.intersects(keyRange));
|
||||
} else {
|
||||
Standalone<BlobGranuleHistoryValue> historyValue = decodeBlobGranuleHistoryValue(it.value);
|
||||
granulesToCheck.emplace_back(keyRange, version, historyValue.granuleID);
|
||||
}
|
||||
}
|
||||
// ASSERT(!purgeRange.intersects(keyRange));
|
||||
}
|
||||
if (!history.empty() && history.more) {
|
||||
cur = KeyRangeRef(keyAfter(history.back().key), cur.end);
|
||||
|
@ -538,10 +654,35 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
}
|
||||
}
|
||||
|
||||
tr.reset();
|
||||
state int i;
|
||||
if (BGV_DEBUG && !granulesToCheck.empty()) {
|
||||
fmt::print("Checking metadata for {0} non-purged ranges\n", granulesToCheck.size());
|
||||
}
|
||||
for (i = 0; i < granulesToCheck.size(); i++) {
|
||||
loop {
|
||||
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
try {
|
||||
bool success = wait(self->checkGranuleMetadataPurged(&tr,
|
||||
std::get<0>(granulesToCheck[i]),
|
||||
std::get<1>(granulesToCheck[i]),
|
||||
std::get<2>(granulesToCheck[i]),
|
||||
strictMetadataCheck,
|
||||
true));
|
||||
if (success) {
|
||||
break;
|
||||
}
|
||||
wait(delay(5.0));
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Void();
|
||||
}
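
    // Illustrative sketch (not part of this commit): the paging pattern used above when scanning a
    // system keyspace. Each pass reads one batch; `more` plus keyAfter() advances the cursor so no
    // key is skipped or re-read. `history` stands in for the RangeResult read inside the try block,
    // and `limit` is a placeholder batch size.
    //
    //     state KeyRange cur = blobGranuleHistoryKeys;
    //     loop {
    //         RangeResult history = wait(tr.getRange(cur, limit));
    //         // ... process this batch ...
    //         if (!history.empty() && history.more) {
    //             cur = KeyRangeRef(keyAfter(history.back().key), cur.end); // resume past the last key seen
    //         } else {
    //             break; // scanned the whole range
    //         }
    //     }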

    ACTOR Future<Void> checkPurgedChangeFeeds(Database cx, BlobGranuleVerifierWorkload* self, KeyRange purgeRange) {
    ACTOR Future<bool> checkPurgedChangeFeeds(Database cx, BlobGranuleVerifierWorkload* self, KeyRange purgeRange) {
        // quick check to make sure we didn't miss any change feeds
        state Transaction tr(cx);
        state KeyRange cur = changeFeedKeys;

@ -566,8 +707,28 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
                                   purgeRange.end.printable(),
                                   self->forcePurgeVersion);
                    }
                    // FIXME!!: there is a known race with the existing force purge algorithm that would require a
                    // bit of a redesign. This is mostly an edge case though that we don't anticipate seeing much in
                    // actual use, and the impact of these leaked change feeds is limited because the range is
                    // purged anyway.
                    bool foundAnyHistoryForRange = false;
                    for (auto& purgedData : self->purgedDataToCheck) {
                        KeyRange granuleRange = std::get<0>(purgedData);
                        if (granuleRange.intersects(keyRange)) {
                            foundAnyHistoryForRange = true;
                            break;
                        }
                    }

                    if (!foundAnyHistoryForRange) {
                        // if range never existed in blob, and was doing the initial snapshot, it could have a
                        // change feed but not a history entry/snapshot
                        CODE_PROBE(true, "not failing test for leaked feed with no history");
                        fmt::print("Not failing test b/c feed never had history!\n");
                    }
                    return !foundAnyHistoryForRange;
                }
                    ASSERT(!purgeRange.intersects(keyRange));
                    // ASSERT(!purgeRange.intersects(keyRange));
            }
            if (!feeds.empty() && feeds.more) {
                cur = KeyRangeRef(keyAfter(feeds.back().key), cur.end);

@ -579,7 +740,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
            }
        }

        return Void();
        return true;
    }
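
    // Note (illustrative, not part of this commit): summary of the leaked-feed decision above.
    //   - feed key intersects a granule recorded in purgedDataToCheck -> genuine leak; false is
    //     returned so the caller can retry until the purge catches up;
    //   - feed key matches no recorded granule history -> the granule was still taking its initial
    //     snapshot when the force purge raced it, so the feed is tolerated and true is returned.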

    ACTOR Future<Void> validateForcePurge(Database cx, BlobGranuleVerifierWorkload* self, KeyRange purgeRange) {

@ -650,34 +811,9 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
            loop {
                tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
                try {
                    // change feed
                    Optional<Value> changeFeed = wait(tr.get(granuleIDToCFKey(granuleId).withPrefix(changeFeedPrefix)));
                    ASSERT(!changeFeed.present());

                    // file metadata
                    RangeResult fileMetadata = wait(tr.getRange(blobGranuleFileKeyRangeFor(granuleId), 1));
                    ASSERT(fileMetadata.empty());

                    if (strictMetadataCheck) {
                        // lock
                        Optional<Value> lock = wait(tr.get(blobGranuleLockKeyFor(granuleRange)));
                        ASSERT(!lock.present());

                        // history entry
                        Optional<Value> history = wait(tr.get(blobGranuleHistoryKeyFor(granuleRange, historyVersion)));
                        ASSERT(!history.present());

                        // split state
                        RangeResult splitData = wait(tr.getRange(blobGranuleSplitKeyRangeFor(granuleId), 1));
                        ASSERT(splitData.empty());

                        // merge state
                        Optional<Value> merge = wait(tr.get(blobGranuleMergeKeyFor(granuleId)));
                        ASSERT(!merge.present());

                        // FIXME: add merge boundaries!
                    }

                    bool success = wait(self->checkGranuleMetadataPurged(
                        &tr, granuleRange, historyVersion, granuleId, strictMetadataCheck, false));
                    ASSERT(success);
                    break;
                } catch (Error& e) {
                    wait(tr.onError(e));

@ -710,13 +846,18 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
                filesChecked);
        }

        wait(self->checkPurgedHistoryEntries(cx, self, purgeRange));
        wait(self->checkPurgedHistoryEntries(cx, self, purgeRange, strictMetadataCheck));

        if (BGV_DEBUG) {
            fmt::print("BGV force purge checked for new granule history entries\n");
        }

        wait(self->checkPurgedChangeFeeds(cx, self, purgeRange));
        loop {
            bool success = wait(self->checkPurgedChangeFeeds(cx, self, purgeRange));
            if (success) {
                break;
            }
        }

        if (BGV_DEBUG) {
            fmt::print("BGV force purge checked for leaked change feeds\n");

@ -774,16 +915,85 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
        return Void();
    }
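
    // Illustrative sketch (not part of this commit): the force-purge lifecycle this validation
    // covers, using calls that appear in _check() below.
    //
    //     Version purgeVersion = wait(self->doGrv(&tr));
    //     wait(self->loadGranuleMetadataBeforeForcePurge(cx, self));   // snapshot what should disappear
    //     Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, purgeVersion, {}, /*force=*/true));
    //     self->forcePurgeKey = purgeKey;                              // purge is now registered
    //     wait(self->validateForcePurge(cx, self, normalKeys));        // asserts all metadata is gone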

    ACTOR Future<bool> _check(Database cx, BlobGranuleVerifierWorkload* self) {
        if (self->startedForcePurge) {
            // data may or may not be gone, depending on whether force purge was registered or not. Only do force purge
            // check if we're sure it was registered, otherwise, only do actual checks if we're sure no force purge was
            // started.
            if (self->forcePurgeKey.present()) {
                wait(self->validateForcePurge(cx, self, normalKeys));
    // Check database against blob granules. This is especially important because during chaos phase this can error, and
    // initAtEnd doesn't get data checked otherwise
    ACTOR Future<bool> checkAllData(Database cx, BlobGranuleVerifierWorkload* self) {
        state Transaction tr(cx);
        state KeyRange keyRange = normalKeys;
        state bool gotEOS = false;
        state int64_t totalRows = 0;
        loop {
            state RangeResult output;
            state Version readVersion;
            try {
                Version ver = wait(tr.getReadVersion());
                readVersion = ver;

                state PromiseStream<Standalone<RangeResultRef>> results;
                state Future<Void> stream = tr.getRangeStream(results, keyRange, GetRangeLimits());

                loop {
                    Standalone<RangeResultRef> res = waitNext(results.getFuture());
                    output.arena().dependsOn(res.arena());
                    output.append(output.arena(), res.begin(), res.size());
                }
            } catch (Error& e) {
                if (e.code() == error_code_operation_cancelled) {
                    throw e;
                }
                if (e.code() == error_code_end_of_stream) {
                    gotEOS = true;
                } else {
                    wait(tr.onError(e));
                }
            }

            if (!output.empty()) {
                state KeyRange rangeToCheck = KeyRangeRef(keyRange.begin, keyAfter(output.back().key));
                std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
                    wait(readFromBlob(cx, self->bstore, rangeToCheck, 0, readVersion));
                if (!compareFDBAndBlob(output, blob, rangeToCheck, readVersion, BGV_DEBUG)) {
                    return false;
                }
                totalRows += output.size();
                keyRange = KeyRangeRef(rangeToCheck.end, keyRange.end);
            }
            if (gotEOS) {
                break;
            }
                return true;
            }

        if (BGV_DEBUG) {
            fmt::print("BGV Final data check complete, checked {0} rows\n", totalRows);
        }

        return true;
    }
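
    // Illustrative sketch (not part of this commit): the getRangeStream() consumption pattern used
    // in checkAllData() above. The inner waitNext() loop never exits normally; the stream signals
    // completion by throwing end_of_stream, which the catch block converts into gotEOS = true.
    //
    //     state PromiseStream<Standalone<RangeResultRef>> results;
    //     state Future<Void> stream = tr.getRangeStream(results, keyRange, GetRangeLimits());
    //     try {
    //         loop {
    //             Standalone<RangeResultRef> res = waitNext(results.getFuture()); // throws end_of_stream when done
    //             // ... accumulate res ...
    //         }
    //     } catch (Error& e) {
    //         if (e.code() != error_code_end_of_stream) {
    //             throw;
    //         }
    //     }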

    ACTOR Future<bool> _check(Database cx, BlobGranuleVerifierWorkload* self) {
        state Transaction tr(cx);
        if (self->doForcePurge) {
            if (self->startedForcePurge) {
                if (self->forcePurgeKey.present()) {
                    wait(self->validateForcePurge(cx, self, normalKeys));
                } // else if we had already started purge during the test but aren't sure whether it was registered or
                  // not,
                  // don't validate that data was purged since it may never be
                return true;
            }
        } else if (self->enablePurging && self->purgeAtLatest && deterministicRandom()->coinflip()) {
            Version latestPurgeVersion = wait(self->doGrv(&tr));
            if (BGV_DEBUG) {
                fmt::print("BGV Purging Latest @ {0} before final availability check\n", latestPurgeVersion);
            }
            Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, latestPurgeVersion, {}, false));
            wait(cx->waitPurgeGranulesComplete(purgeKey));
            if (BGV_DEBUG) {
                fmt::print("BGV Purged Latest before final availability check complete\n");
            }
            self->purges++;
        }

        // check error counts, and do an availability check at the end

        if (self->doSetup && self->initAtEnd) {

@ -791,7 +1001,6 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
            wait(self->setUpBlobRange(cx));
        }

        state Transaction tr(cx);
        state Version readVersion = wait(self->doGrv(&tr));
        state Version startReadVersion = readVersion;
        state int checks = 0;

@ -887,8 +1096,11 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
        if (BGV_DEBUG && startReadVersion != readVersion) {
            fmt::print("Availability check updated read version from {0} to {1}\n", startReadVersion, readVersion);
        }

        state bool dataPassed = wait(self->checkAllData(cx, self));

        state bool result =
            availabilityPassed && self->mismatches == 0 && (checks > 0) && (self->timeTravelTooOld == 0);
            availabilityPassed && dataPassed && self->mismatches == 0 && (checks > 0) && (self->timeTravelTooOld == 0);
        fmt::print("Blob Granule Verifier {0} {1}:\n", self->clientId, result ? "passed" : "failed");
        fmt::print(" {} successful final granule checks\n", checks);
        fmt::print(" {} failed final granule checks\n", availabilityPassed ? 0 : 1);

@ -900,6 +1112,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
        fmt::print(" {} rows\n", self->rowsRead);
        fmt::print(" {} bytes\n", self->bytesRead);
        fmt::print(" {} purges\n", self->purges);
        fmt::print(" {} final data check\n", dataPassed ? "passed" : "failed");
        // FIXME: add above as details to trace event

        TraceEvent("BlobGranuleVerifierChecked").detail("Result", result);

@ -907,11 +1120,49 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
        // For some reason simulation is still passing when this fails?.. so assert for now
        ASSERT(result);

        // FIXME: if doPurging was set, possibly do one last purge here, and verify it succeeds with no errors
        if (self->doForcePurge) {
            // if granules are available, and we didn't do a force purge during the test, do it now
            ASSERT(!self->startedForcePurge);
            Version rv = wait(self->doGrv(&tr));
            self->forcePurgeVersion = rv;
            self->purgedDataToCheck.clear(); // in case we started but didn't finish loading it, reset it
            wait(self->loadGranuleMetadataBeforeForcePurge(cx, self));
            Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, self->forcePurgeVersion, {}, true));
            self->forcePurgeKey = purgeKey;
            wait(self->validateForcePurge(cx, self, normalKeys));

            return true;
        } else if (self->enablePurging && self->purgeAtLatest && deterministicRandom()->coinflip()) {
            Version latestPurgeVersion = wait(self->doGrv(&tr));
            if (BGV_DEBUG) {
                fmt::print("BGV Purging Latest @ {0} after final availability check, waiting\n", latestPurgeVersion);
            }
            Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, latestPurgeVersion, {}, false));
            wait(cx->waitPurgeGranulesComplete(purgeKey));
            if (BGV_DEBUG) {
                fmt::print("BGV Purged Latest after final availability check complete\n");
            }
        }

        if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && self->clearAndMergeCheck) {
            CODE_PROBE(true, "BGV clearing database and awaiting merge");
            wait(clearAndAwaitMerge(cx, normalKeys));

            if (self->enablePurging && self->purgeAtLatest && deterministicRandom()->coinflip()) {
                Version latestPurgeVersion = wait(self->doGrv(&tr));
                if (BGV_DEBUG) {
                    fmt::print("BGV Purging Latest @ {0} after clearAndAwaitMerge, waiting\n", latestPurgeVersion);
                }
                Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, latestPurgeVersion, {}, false));
                wait(cx->waitPurgeGranulesComplete(purgeKey));
                if (BGV_DEBUG) {
                    fmt::print("BGV Purged Latest after clearAndAwaitMerge complete\n");
                }
            }

            // read after merge to make sure it completed, granules are available, and data is empty
            bool dataCheckAfterMerge = wait(self->checkAllData(cx, self));
            ASSERT(dataCheckAfterMerge);
        }

        return result;

@ -21,14 +21,17 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/NativeAPI.actor.h"
#include "flow/EncryptUtils.h"
#include "flow/Error.h"
#include "flow/IRandom.h"
#include "flow/BlobCipher.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/flow.h"
#include "flow/ITrace.h"
#include "flow/Trace.h"

#include <chrono>
#include <cstring>
#include <limits>
#include <memory>
#include <random>

@ -111,6 +114,7 @@ struct EncryptionOpsWorkload : TestWorkload {
    int pageSize;
    int maxBufSize;
    std::unique_ptr<uint8_t[]> buff;
    int enableTTLTest;

    Arena arena;
    std::unique_ptr<WorkloadMetrics> metrics;

@ -121,7 +125,7 @@ struct EncryptionOpsWorkload : TestWorkload {
    EncryptCipherBaseKeyId headerBaseCipherId;
    EncryptCipherRandomSalt headerRandomSalt;

    EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
    EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), enableTTLTest(false) {
        mode = getOption(options, LiteralStringRef("fixedSize"), 1);
        numIterations = getOption(options, LiteralStringRef("numIterations"), 10);
        pageSize = getOption(options, LiteralStringRef("pageSize"), 4096);

@ -136,13 +140,18 @@ struct EncryptionOpsWorkload : TestWorkload {

        metrics = std::make_unique<WorkloadMetrics>();

        if (wcx.clientId == 0 && mode == 1) {
            enableTTLTest = true;
        }

        TraceEvent("EncryptionOpsWorkload")
            .detail("Mode", getModeStr())
            .detail("MinDomainId", minDomainId)
            .detail("MaxDomainId", maxDomainId);
            .detail("MaxDomainId", maxDomainId)
            .detail("EnableTTL", enableTTLTest);
    }

    ~EncryptionOpsWorkload() { TraceEvent("EncryptionOpsWorkload_Done").log(); }
    ~EncryptionOpsWorkload() { TraceEvent("EncryptionOpsWorkload.Done").log(); }

    bool isFixedSizePayload() { return mode == 1; }

@ -165,14 +174,19 @@ struct EncryptionOpsWorkload : TestWorkload {
    void setupCipherEssentials() {
        Reference<BlobCipherKeyCache> cipherKeyCache = BlobCipherKeyCache::getInstance();

        TraceEvent("SetupCipherEssentials_Start").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);
        TraceEvent("SetupCipherEssentials.Start").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);

        uint8_t buff[AES_256_KEY_LENGTH];
        std::vector<Reference<BlobCipherKey>> cipherKeys;
        int cipherLen = 0;
        for (EncryptCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
            generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
            cipherKeyCache->insertCipherKey(id, minBaseCipherId, buff, cipherLen);
            cipherKeyCache->insertCipherKey(id,
                                            minBaseCipherId,
                                            buff,
                                            cipherLen,
                                            std::numeric_limits<int64_t>::max(),
                                            std::numeric_limits<int64_t>::max());

            ASSERT(cipherLen > 0 && cipherLen <= AES_256_KEY_LENGTH);

@ -183,13 +197,18 @@ struct EncryptionOpsWorkload : TestWorkload {
        // insert the Encrypt Header cipherKey; record cipherDetails as getLatestCipher() may not work with multiple
        // test clients
        generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
        cipherKeyCache->insertCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId, buff, cipherLen);
        cipherKeyCache->insertCipherKey(ENCRYPT_HEADER_DOMAIN_ID,
                                        headerBaseCipherId,
                                        buff,
                                        cipherLen,
                                        std::numeric_limits<int64_t>::max(),
                                        std::numeric_limits<int64_t>::max());
        Reference<BlobCipherKey> latestCipher = cipherKeyCache->getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID);
        ASSERT_EQ(latestCipher->getBaseCipherId(), headerBaseCipherId);
        ASSERT_EQ(memcmp(latestCipher->rawBaseCipher(), buff, cipherLen), 0);
        headerRandomSalt = latestCipher->getSalt();

        TraceEvent("SetupCipherEssentials_Done")
        TraceEvent("SetupCipherEssentials.Done")
            .detail("MinDomainId", minDomainId)
            .detail("MaxDomainId", maxDomainId)
            .detail("HeaderBaseCipherId", headerBaseCipherId)

@ -198,9 +217,14 @@ struct EncryptionOpsWorkload : TestWorkload {

    void resetCipherEssentials() {
        Reference<BlobCipherKeyCache> cipherKeyCache = BlobCipherKeyCache::getInstance();
        cipherKeyCache->cleanup();
        for (EncryptCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
            cipherKeyCache->resetEncryptDomainId(id);
        }

        TraceEvent("ResetCipherEssentials_Done").log();
        cipherKeyCache->resetEncryptDomainId(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID);
        cipherKeyCache->resetEncryptDomainId(ENCRYPT_HEADER_DOMAIN_ID);

        TraceEvent("ResetCipherEssentials.Done").log();
    }

    void updateLatestBaseCipher(const EncryptCipherDomainId encryptDomainId,

@ -232,7 +256,9 @@ struct EncryptionOpsWorkload : TestWorkload {
                                        baseCipherId,
                                        cipherKey->rawBaseCipher(),
                                        cipherKey->getBaseCipherLen(),
                                        cipherKey->getSalt());
                                        cipherKey->getSalt(),
                                        std::numeric_limits<int64_t>::max(),
                                        std::numeric_limits<int64_t>::max());
        // Ensure the update was a NOP
        Reference<BlobCipherKey> cKey = cipherKeyCache->getCipherKey(domainId, baseCipherId, salt);
        ASSERT(cKey->isEqual(cipherKey));
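        // Note (illustrative, not part of this commit): re-inserting the identical
        // (baseCipherId, cipher bytes, salt) tuple is expected to be a NOP, as asserted above:
        // the cache hands back the already-cached key instead of minting a new entry or salt.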

@ -297,11 +323,7 @@ struct EncryptionOpsWorkload : TestWorkload {
        metrics->updateDecryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
    }

    Future<Void> setup(Database const& ctx) override { return Void(); }

    std::string description() const override { return "EncryptionOps"; }

    Future<Void> start(Database const& cx) override {
    void testBlobCipherKeyCacheOps() {
        uint8_t baseCipher[AES_256_KEY_LENGTH];
        int baseCipherLen = 0;
        EncryptCipherBaseKeyId nextBaseCipherId;

@ -322,7 +344,12 @@ struct EncryptionOpsWorkload : TestWorkload {
            if (updateBaseCipher) {
                // simulate baseCipherId getting refreshed/updated
                updateLatestBaseCipher(encryptDomainId, &baseCipher[0], &baseCipherLen, &nextBaseCipherId);
                cipherKeyCache->insertCipherKey(encryptDomainId, nextBaseCipherId, &baseCipher[0], baseCipherLen);
                cipherKeyCache->insertCipherKey(encryptDomainId,
                                                nextBaseCipherId,
                                                &baseCipher[0],
                                                baseCipherLen,
                                                std::numeric_limits<int64_t>::max(),
                                                std::numeric_limits<int64_t>::max());
            }

            auto start = std::chrono::high_resolution_clock::now();

@ -368,6 +395,103 @@ struct EncryptionOpsWorkload : TestWorkload {

        // Cleanup cipherKeys
        resetCipherEssentials();
    }
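
    // Illustrative sketch (not part of this commit): the widened insertCipherKey() signature used
    // throughout this file. The two trailing timestamps are refreshAt and expireAt; passing
    // INT64_MAX for both marks a key that never needs refresh and never expires:
    //
    //     cipherKeyCache->insertCipherKey(domainId,
    //                                     baseCipherId,
    //                                     baseCipherBytes,                      // hypothetical buffer name
    //                                     baseCipherLen,
    //                                     std::numeric_limits<int64_t>::max(),  // refreshAt: never
    //                                     std::numeric_limits<int64_t>::max()); // expireAt: never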

    static void compareCipherDetails(Reference<BlobCipherKey> cipherKey,
                                     const EncryptCipherDomainId domId,
                                     const EncryptCipherBaseKeyId baseCipherId,
                                     const uint8_t* baseCipher,
                                     const int baseCipherLen,
                                     const int64_t refreshAt,
                                     const int64_t expAt) {
        ASSERT(cipherKey.isValid());
        ASSERT_EQ(cipherKey->getDomainId(), domId);
        ASSERT_EQ(cipherKey->getBaseCipherId(), baseCipherId);
        ASSERT_EQ(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen), 0);
        ASSERT_EQ(cipherKey->getRefreshAtTS(), refreshAt);
        ASSERT_EQ(cipherKey->getExpireAtTS(), expAt);
    }

    ACTOR Future<Void> testBlobCipherKeyCacheTTL(EncryptionOpsWorkload* self) {
        state Reference<BlobCipherKeyCache> cipherKeyCache = BlobCipherKeyCache::getInstance();

        state EncryptCipherDomainId domId = deterministicRandom()->randomInt(120000, 150000);
        state EncryptCipherBaseKeyId baseCipherId = deterministicRandom()->randomInt(786, 1024);
        state std::unique_ptr<uint8_t[]> baseCipher = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
        state Reference<BlobCipherKey> cipherKey;
        state EncryptCipherRandomSalt salt;
        state int64_t refreshAt;
        state int64_t expAt;

        TraceEvent("TestBlobCipherCacheTTL.Start").detail("DomId", domId);

        deterministicRandom()->randomBytes(baseCipher.get(), AES_256_KEY_LENGTH);

        // Validate 'non-revocable' cipher with no expiration
        refreshAt = std::numeric_limits<int64_t>::max();
        expAt = std::numeric_limits<int64_t>::max();
        cipherKeyCache->insertCipherKey(domId, baseCipherId, baseCipher.get(), AES_256_KEY_LENGTH, refreshAt, expAt);
        cipherKey = cipherKeyCache->getLatestCipherKey(domId);
        compareCipherDetails(cipherKey, domId, baseCipherId, baseCipher.get(), AES_256_KEY_LENGTH, refreshAt, expAt);

        TraceEvent("TestBlobCipherCacheTTL.NonRevocableNoExpiry").detail("DomId", domId);

        // Validate 'non-revocable' cipher with expiration
        state EncryptCipherBaseKeyId baseCipherId_1 = baseCipherId + 1;
        refreshAt = now() + 5;
        cipherKeyCache->insertCipherKey(domId, baseCipherId_1, baseCipher.get(), AES_256_KEY_LENGTH, refreshAt, expAt);
        cipherKey = cipherKeyCache->getLatestCipherKey(domId);
        ASSERT(cipherKey.isValid());
        compareCipherDetails(cipherKey, domId, baseCipherId_1, baseCipher.get(), AES_256_KEY_LENGTH, refreshAt, expAt);
        salt = cipherKey->getSalt();
        wait(delayUntil(refreshAt));
        // Ensure that latest cipherKey needs refresh, however, cipher lookup works (non-revocable)
        cipherKey = cipherKeyCache->getLatestCipherKey(domId);
        ASSERT(!cipherKey.isValid());
        cipherKey = cipherKeyCache->getCipherKey(domId, baseCipherId_1, salt);
        ASSERT(cipherKey.isValid());
        compareCipherDetails(cipherKey, domId, baseCipherId_1, baseCipher.get(), AES_256_KEY_LENGTH, refreshAt, expAt);

        TraceEvent("TestBlobCipherCacheTTL.NonRevocableWithExpiry").detail("DomId", domId);

        // Validate 'revocable' cipher with expiration
        state EncryptCipherBaseKeyId baseCipherId_2 = baseCipherId + 2;
        refreshAt = now() + 5;
        expAt = refreshAt + 5;
        cipherKeyCache->insertCipherKey(domId, baseCipherId_2, baseCipher.get(), AES_256_KEY_LENGTH, refreshAt, expAt);
        cipherKey = cipherKeyCache->getLatestCipherKey(domId);
        ASSERT(cipherKey.isValid());
        compareCipherDetails(cipherKey, domId, baseCipherId_2, baseCipher.get(), AES_256_KEY_LENGTH, refreshAt, expAt);
        salt = cipherKey->getSalt();
        wait(delayUntil(refreshAt));
        // Ensure that latest cipherKey needs refresh; direct lookup still works until the revocable key expires
        cipherKey = cipherKeyCache->getLatestCipherKey(domId);
        ASSERT(!cipherKey.isValid());
        cipherKey = cipherKeyCache->getCipherKey(domId, baseCipherId_2, salt);
        ASSERT(cipherKey.isValid());
        compareCipherDetails(cipherKey, domId, baseCipherId_2, baseCipher.get(), AES_256_KEY_LENGTH, refreshAt, expAt);
        wait(delayUntil(expAt));
        // Ensure that cipherKey lookup doesn't work after expiry
        cipherKey = cipherKeyCache->getLatestCipherKey(domId);
        ASSERT(!cipherKey.isValid());
        cipherKey = cipherKeyCache->getCipherKey(domId, baseCipherId_2, salt);
        ASSERT(!cipherKey.isValid());

        TraceEvent("TestBlobCipherCacheTTL.End").detail("DomId", domId);
        return Void();
    }
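
    // Note (illustrative, not part of this commit): the TTL contract exercised above.
    //   refreshAt finite, expireAt = INT64_MAX ("non-revocable"): after refreshAt,
    //       getLatestCipherKey() returns an invalid reference, but the direct
    //       getCipherKey(domId, baseCipherId, salt) lookup still succeeds.
    //   refreshAt < expireAt, both finite ("revocable"): after expireAt, even the direct
    //       getCipherKey() lookup returns an invalid reference.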

    Future<Void> setup(Database const& ctx) override { return Void(); }

    std::string description() const override { return "EncryptionOps"; }

    Future<Void> start(Database const& cx) override { return _start(cx, this); }

    ACTOR Future<Void> _start(Database cx, EncryptionOpsWorkload* self) {
        self->testBlobCipherKeyCacheOps();
        if (self->enableTTLTest) {
            wait(self->testBlobCipherKeyCacheTTL(self));
        }
        return Void();
    }